Predicting Product Sales from Product Features¶

1. Import libraries¶

In [22]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

2. Load the dataset¶

In [23]:
# Load the cleaned e-commerce sales data and preview the first few rows.
csv_path = "cleaned_ecommerce_sales_data.csv"
df = pd.read_csv(csv_path)

df.head()
Out[23]:
product_id product_name category price review_score review_count sales_month_1 sales_month_2 sales_month_3 sales_month_4 sales_month_5 sales_month_6 sales_month_7 sales_month_8 sales_month_9 sales_month_10 sales_month_11 sales_month_12 annual_sales avg_monthly_sales
0 1 Product_1 Clothing 190.40 1.7 220 479 449 92 784 604 904 446 603 807 252 695 306 6421 535.083333
1 2 Product_2 Home & Kitchen 475.60 3.2 903 21 989 861 863 524 128 610 436 176 294 772 353 6027 502.250000
2 3 Product_3 Toys 367.34 4.5 163 348 558 567 143 771 409 290 828 340 667 267 392 5580 465.000000
3 4 Product_4 Toys 301.34 3.9 951 725 678 59 15 937 421 670 933 56 157 168 203 5022 418.500000
4 5 Product_5 Books 82.23 4.2 220 682 451 649 301 620 293 411 258 854 548 770 257 6094 507.833333

3. Inspect and clean the data¶

In [24]:
# Dataset dimensions as (rows, columns).
df.shape
Out[24]:
(1000, 20)
In [25]:
# Full list of column names, to pick out feature and target columns below.
df.columns.tolist()
Out[25]:
['product_id',
 'product_name',
 'category',
 'price',
 'review_score',
 'review_count',
 'sales_month_1',
 'sales_month_2',
 'sales_month_3',
 'sales_month_4',
 'sales_month_5',
 'sales_month_6',
 'sales_month_7',
 'sales_month_8',
 'sales_month_9',
 'sales_month_10',
 'sales_month_11',
 'sales_month_12',
 'annual_sales',
 'avg_monthly_sales']
In [26]:
# Column dtypes — confirms which columns are numeric vs. object (string).
df.dtypes
Out[26]:
product_id             int64
product_name          object
category              object
price                float64
review_score         float64
review_count           int64
sales_month_1          int64
sales_month_2          int64
sales_month_3          int64
sales_month_4          int64
sales_month_5          int64
sales_month_6          int64
sales_month_7          int64
sales_month_8          int64
sales_month_9          int64
sales_month_10         int64
sales_month_11         int64
sales_month_12         int64
annual_sales           int64
avg_monthly_sales    float64
dtype: object
In [27]:
# Count of missing values per column (all zero, so no imputation is strictly needed).
df.isna().sum()
Out[27]:
product_id           0
product_name         0
category             0
price                0
review_score         0
review_count         0
sales_month_1        0
sales_month_2        0
sales_month_3        0
sales_month_4        0
sales_month_5        0
sales_month_6        0
sales_month_7        0
sales_month_8        0
sales_month_9        0
sales_month_10       0
sales_month_11       0
sales_month_12       0
annual_sales         0
avg_monthly_sales    0
dtype: int64

The dataset has no missing values and no duplicate data points, so no further cleaning is required.

In [28]:
# Summary statistics for the numeric columns of interest.
# A bare last expression uses Jupyter's rich DataFrame display instead of
# print(), which renders only a plain-text dump.
summary_cols = ["price", "review_score", "review_count", "annual_sales", "avg_monthly_sales"]
df[summary_cols].describe()
             price  review_score  review_count  annual_sales  \
count  1000.000000   1000.000000   1000.000000   1000.000000   
mean    247.677130      3.027600    526.506000   6019.912000   
std     144.607983      1.171243    282.269932    992.273787   
min       7.290000      1.000000      1.000000   2972.000000   
25%     121.810000      2.000000    283.750000   5392.000000   
50%     250.920000      3.100000    543.000000   5992.000000   
75%     373.435000      4.000000    772.000000   6680.000000   
max     499.860000      5.000000    999.000000   9151.000000   

       avg_monthly_sales  
count        1000.000000  
mean          501.659333  
std            82.689482  
min           247.666667  
25%           449.333333  
50%           499.333333  
75%           556.666667  
max           762.583333  

4. Exploratory data analysis¶

Checking for correlations between the numeric product features and the sales targets.

In [29]:
# Pairwise Pearson correlations between the numeric feature columns and the
# two sales targets. `corr_matrix` is reused by the heatmap cell below.
numeric_cols = ["price", "review_score", "review_count", "annual_sales", "avg_monthly_sales"]
corr_matrix = df.loc[:, numeric_cols].corr(numeric_only=True)
corr_matrix
Out[29]:
price review_score review_count annual_sales avg_monthly_sales
price 1.000000 0.028960 0.042189 -0.015978 -0.015978
review_score 0.028960 1.000000 0.027351 -0.018186 -0.018186
review_count 0.042189 0.027351 1.000000 -0.069393 -0.069393
annual_sales -0.015978 -0.018186 -0.069393 1.000000 1.000000
avg_monthly_sales -0.015978 -0.018186 -0.069393 1.000000 1.000000
In [30]:
# Distribution of the target variable. Uses the explicit Axes interface
# (fig, ax) rather than the pyplot state machine, so the figure object is
# reusable and multi-panel layouts stay sane.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(df["annual_sales"], bins=30, edgecolor="black")
ax.set_title("Distribution of Annual Sales")
ax.set_xlabel("Annual Sales")
ax.set_ylabel("Number of Products")
fig.tight_layout()
plt.show()
No description has been provided for this image
In [31]:
# Review count vs. annual sales — a visual check of the weak negative
# correlation seen in the correlation matrix. Explicit Axes interface
# instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(df["review_count"], df["annual_sales"], alpha=0.7)
ax.set_title("Review Count vs Annual Sales")
ax.set_xlabel("Review Count")
ax.set_ylabel("Annual Sales")
fig.tight_layout()
plt.show()
No description has been provided for this image
In [32]:
# Annotated heatmap of the correlation matrix computed above. Passing the
# Axes explicitly (ax=ax) avoids relying on pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
fig.tight_layout()
plt.show()
No description has been provided for this image

5. Prepare for machine learning¶

The target variable is annual_sales. The predictor variables are price, review_score, review_count, and category.

In [33]:
# Features and target. `annual_sales` is the prediction target; the
# month-by-month sales columns are deliberately excluded because they sum
# to the target.
feature_cols = ["price", "review_score", "review_count", "category"]
target_col = "annual_sales"

X = df[feature_cols]
y = df[target_col]

# Hold out 20% of products for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

numeric_features = ["price", "review_score", "review_count"]
categorical_features = ["category"]

# Median imputation + standardization for numeric columns.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Mode imputation + one-hot encoding for the category column;
# handle_unknown="ignore" keeps prediction safe for unseen categories.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)

6. Train and evaluate machine learning models¶

In [ ]:
# Candidate regressors, each trained inside the shared preprocessing pipeline.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(
        n_estimators=300,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42,
    ),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
}

results = []
fitted_pipelines = {}

for name, estimator in models.items():
    # Fitting preprocessing and model together ensures the scaler/encoder
    # statistics come from the training split only (no test-set leakage).
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", estimator)])
    pipeline.fit(X_train, y_train)
    fitted_pipelines[name] = pipeline

    preds = pipeline.predict(X_test)
    results.append(
        {
            "model": name,
            "MAE": mean_absolute_error(y_test, preds),
            "RMSE": np.sqrt(mean_squared_error(y_test, preds)),
            "R2": r2_score(y_test, preds),
        }
    )

# Rank models by test-set R², best first.
results_df = pd.DataFrame(results).sort_values("R2", ascending=False)
results_df
Out[ ]:
model MAE RMSE R2
0 Linear Regression 777.932688 956.896757 -0.006772
1 Random Forest 830.719017 1038.505538 -0.185820
2 Gradient Boosting 845.317601 1048.366438 -0.208446
In [ ]:
# Pick the top-ranked model (results_df is sorted by R², best first) and
# regenerate its test-set predictions for the plots below.
best_row = results_df.iloc[0]
best_model_name = best_row["model"]
best_pipeline = fitted_pipelines[best_model_name]
best_preds = best_pipeline.predict(X_test)

print("Best model based on R2:", best_model_name)
print(best_row)
Best model based on R2: Linear Regression
model    Linear Regression
MAE             777.932688
RMSE            956.896757
R2               -0.006772
Name: 0, dtype: object
In [36]:
# Actual vs. predicted annual sales for the best model; points on the
# dashed identity line would be perfect predictions. Explicit Axes
# interface instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(y_test, best_preds, alpha=0.7)
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], linestyle="--")
ax.set_title(f"Actual vs Predicted Annual Sales ({best_model_name})")
ax.set_xlabel("Actual Annual Sales")
ax.set_ylabel("Predicted Annual Sales")
fig.tight_layout()
plt.show()
No description has been provided for this image

7. Interpret the best model¶

In [37]:
# Extract the fitted preprocessing step and estimator from the best pipeline.
# Fresh names are used so we don't shadow the module-level `preprocessor`
# (the unfitted ColumnTransformer) — reusing that name would make the
# training cell behave differently on an out-of-order re-run.
fitted_preprocessor = best_pipeline.named_steps["preprocessor"]
best_model = best_pipeline.named_steps["model"]
feature_names = fitted_preprocessor.get_feature_names_out()

if hasattr(best_model, "feature_importances_"):
    # Tree ensembles expose impurity-based feature importances.
    interpretation_df = pd.DataFrame(
        {"feature": feature_names, "importance": best_model.feature_importances_}
    ).sort_values("importance", ascending=False)
elif hasattr(best_model, "coef_"):
    # Linear models: rank coefficients by absolute magnitude.
    interpretation_df = pd.DataFrame(
        {"feature": feature_names, "coefficient": best_model.coef_}
    ).sort_values("coefficient", key=np.abs, ascending=False)
else:
    interpretation_df = pd.DataFrame()

# A bare expression inside an if-branch is NOT auto-displayed by Jupyter
# (only the cell's last top-level expression is), so the preview goes here.
interpretation_df.head(10)
In [38]:
if "importance" in interpretation_df.columns:
    top10 = interpretation_df.head(10).sort_values("importance")
    plt.figure(figsize=(9, 5))
    plt.barh(top10["feature"], top10["importance"])
    plt.title(f"Top 10 Most Important Features ({best_model_name})")
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.tight_layout()
    plt.show()
elif "coefficient" in interpretation_df.columns:
    top10 = interpretation_df.head(10).sort_values("coefficient")
    plt.figure(figsize=(9, 5))
    plt.barh(top10["feature"], top10["coefficient"])
    plt.title(f"Top 10 Coefficients by Magnitude ({best_model_name})")
    plt.xlabel("Coefficient")
    plt.ylabel("Feature")
    plt.tight_layout()
    plt.show()
No description has been provided for this image

8. Conclusion¶

The models in this notebook did not predict annual sales well. Even the best model had an R² value below 0, meaning it performed worse than simply predicting the mean annual sales for every product. This suggests that price, review score, review count, and category are weak predictors of annual sales in this dataset. This is still an important result because it shows that better sales prediction likely requires more informative features, such as advertising spend, discounts, seasonality, or brand strength.