Predicting Product Sales from Product Features¶
1. Import libraries¶
In [22]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
2. Load the dataset¶
In [23]:
# Load the cleaned e-commerce sales dataset and preview the first rows
DATA_PATH = "cleaned_ecommerce_sales_data.csv"
df = pd.read_csv(DATA_PATH)
df.head()
Out[23]:
| product_id | product_name | category | price | review_score | review_count | sales_month_1 | sales_month_2 | sales_month_3 | sales_month_4 | sales_month_5 | sales_month_6 | sales_month_7 | sales_month_8 | sales_month_9 | sales_month_10 | sales_month_11 | sales_month_12 | annual_sales | avg_monthly_sales | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Product_1 | Clothing | 190.40 | 1.7 | 220 | 479 | 449 | 92 | 784 | 604 | 904 | 446 | 603 | 807 | 252 | 695 | 306 | 6421 | 535.083333 |
| 1 | 2 | Product_2 | Home & Kitchen | 475.60 | 3.2 | 903 | 21 | 989 | 861 | 863 | 524 | 128 | 610 | 436 | 176 | 294 | 772 | 353 | 6027 | 502.250000 |
| 2 | 3 | Product_3 | Toys | 367.34 | 4.5 | 163 | 348 | 558 | 567 | 143 | 771 | 409 | 290 | 828 | 340 | 667 | 267 | 392 | 5580 | 465.000000 |
| 3 | 4 | Product_4 | Toys | 301.34 | 3.9 | 951 | 725 | 678 | 59 | 15 | 937 | 421 | 670 | 933 | 56 | 157 | 168 | 203 | 5022 | 418.500000 |
| 4 | 5 | Product_5 | Books | 82.23 | 4.2 | 220 | 682 | 451 | 649 | 301 | 620 | 293 | 411 | 258 | 854 | 548 | 770 | 257 | 6094 | 507.833333 |
3. Inspect and clean the data¶
In [24]:
# Dataset dimensions: (rows, columns)
df.shape
Out[24]:
(1000, 20)
In [25]:
# List every column name to confirm the expected schema
df.columns.tolist()
Out[25]:
['product_id', 'product_name', 'category', 'price', 'review_score', 'review_count', 'sales_month_1', 'sales_month_2', 'sales_month_3', 'sales_month_4', 'sales_month_5', 'sales_month_6', 'sales_month_7', 'sales_month_8', 'sales_month_9', 'sales_month_10', 'sales_month_11', 'sales_month_12', 'annual_sales', 'avg_monthly_sales']
In [26]:
# Column dtypes — numeric features should be int64/float64, identifiers object
df.dtypes
Out[26]:
product_id int64 product_name object category object price float64 review_score float64 review_count int64 sales_month_1 int64 sales_month_2 int64 sales_month_3 int64 sales_month_4 int64 sales_month_5 int64 sales_month_6 int64 sales_month_7 int64 sales_month_8 int64 sales_month_9 int64 sales_month_10 int64 sales_month_11 int64 sales_month_12 int64 annual_sales int64 avg_monthly_sales float64 dtype: object
In [27]:
# Count missing values per column (all zeros means no imputation is strictly needed)
df.isna().sum()
Out[27]:
product_id 0 product_name 0 category 0 price 0 review_score 0 review_count 0 sales_month_1 0 sales_month_2 0 sales_month_3 0 sales_month_4 0 sales_month_5 0 sales_month_6 0 sales_month_7 0 sales_month_8 0 sales_month_9 0 sales_month_10 0 sales_month_11 0 sales_month_12 0 annual_sales 0 avg_monthly_sales 0 dtype: int64
The dataset has no missing values in any column.
In [28]:
# Summary statistics for the key numeric columns.
# Use the frame itself as the cell's last expression so Jupyter renders a
# rich HTML table instead of the wrapped plain-text output that print() gives.
df[["price", "review_score", "review_count", "annual_sales", "avg_monthly_sales"]].describe()
price review_score review_count annual_sales \
count 1000.000000 1000.000000 1000.000000 1000.000000
mean 247.677130 3.027600 526.506000 6019.912000
std 144.607983 1.171243 282.269932 992.273787
min 7.290000 1.000000 1.000000 2972.000000
25% 121.810000 2.000000 283.750000 5392.000000
50% 250.920000 3.100000 543.000000 5992.000000
75% 373.435000 4.000000 772.000000 6680.000000
max 499.860000 5.000000 999.000000 9151.000000
avg_monthly_sales
count 1000.000000
mean 501.659333
std 82.689482
min 247.666667
25% 449.333333
50% 499.333333
75% 556.666667
max 762.583333
4. Exploratory data analysis¶
Checking for correlations among the numeric features and the sales targets.
In [29]:
# Pairwise Pearson correlations between the numeric features and the targets.
numeric_cols = ["price", "review_score", "review_count", "annual_sales", "avg_monthly_sales"]
corr_matrix = df[numeric_cols].corr(numeric_only=True)
corr_matrix
Out[29]:
| price | review_score | review_count | annual_sales | avg_monthly_sales | |
|---|---|---|---|---|---|
| price | 1.000000 | 0.028960 | 0.042189 | -0.015978 | -0.015978 |
| review_score | 0.028960 | 1.000000 | 0.027351 | -0.018186 | -0.018186 |
| review_count | 0.042189 | 0.027351 | 1.000000 | -0.069393 | -0.069393 |
| annual_sales | -0.015978 | -0.018186 | -0.069393 | 1.000000 | 1.000000 |
| avg_monthly_sales | -0.015978 | -0.018186 | -0.069393 | 1.000000 | 1.000000 |
In [30]:
# Distribution of the target variable — explicit Axes interface
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(df["annual_sales"], bins=30, edgecolor="black")
ax.set_title("Distribution of Annual Sales")
ax.set_xlabel("Annual Sales")
ax.set_ylabel("Number of Products")
fig.tight_layout()
plt.show()
In [31]:
# Scatter of review volume against the target — explicit Axes interface
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(df["review_count"], df["annual_sales"], alpha=0.7)
ax.set_title("Review Count vs Annual Sales")
ax.set_xlabel("Review Count")
ax.set_ylabel("Annual Sales")
fig.tight_layout()
plt.show()
In [32]:
# Heatmap of the correlation matrix — pass the Axes explicitly to seaborn
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
fig.tight_layout()
plt.show()
5. Prepare for machine learning¶
The target variable is annual_sales. The predictor variables are price, review_score, review_count, and category.
In [33]:
# Features, target, train/test split, and preprocessing definition.
feature_cols = ["price", "review_score", "review_count", "category"]
target_col = "annual_sales"

X = df[feature_cols]
y = df[target_col]

# Hold out 20% of products for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

numeric_features = ["price", "review_score", "review_count"]
categorical_features = ["category"]

# Numeric: median-impute then standardize.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical: mode-impute then one-hot encode; ignore categories unseen in training.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)
6. Train and evaluate machine learning models¶
In [ ]:
# Candidate regressors, all evaluated through the same preprocessing pipeline.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(
        n_estimators=300,
        random_state=42,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
    ),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
}


def fit_and_score(name, estimator):
    """Fit preprocessing + estimator on the train split and score on the test split.

    Returns the fitted pipeline and a dict of evaluation metrics (MAE, RMSE, R2).
    """
    pipe = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("model", estimator),
        ]
    )
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    metrics = {
        "model": name,
        "MAE": mean_absolute_error(y_test, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
        "R2": r2_score(y_test, y_pred),
    }
    return pipe, metrics


results = []
fitted_pipelines = {}
for model_name, model in models.items():
    pipeline, row = fit_and_score(model_name, model)
    fitted_pipelines[model_name] = pipeline
    results.append(row)

# Rank models best-first by R2 on the held-out test set.
results_df = pd.DataFrame(results).sort_values("R2", ascending=False)
results_df
Out[ ]:
| model | MAE | RMSE | R2 | |
|---|---|---|---|---|
| 0 | Linear Regression | 777.932688 | 956.896757 | -0.006772 |
| 1 | Random Forest | 830.719017 | 1038.505538 | -0.185820 |
| 2 | Gradient Boosting | 845.317601 | 1048.366438 | -0.208446 |
In [ ]:
#Grab best result from list
best_model_name = results_df.iloc[0]["model"]
best_pipeline = fitted_pipelines[best_model_name]
best_preds = best_pipeline.predict(X_test)
print("Best model based on R2:", best_model_name)
print(results_df.iloc[0])
Best model based on R2: Linear Regression model Linear Regression MAE 777.932688 RMSE 956.896757 R2 -0.006772 Name: 0, dtype: object
In [36]:
# Actual vs predicted for the best model; the dashed y = x line marks perfect prediction.
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(y_test, best_preds, alpha=0.7)
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], linestyle="--")
ax.set_title(f"Actual vs Predicted Annual Sales ({best_model_name})")
ax.set_xlabel("Actual Annual Sales")
ax.set_ylabel("Predicted Annual Sales")
fig.tight_layout()
plt.show()
7. Interpret the best model¶
In [37]:
# Pull the fitted preprocessing step and estimator out of the best pipeline.
# Use fresh names so this cell does not clobber the unfitted `preprocessor`
# defined earlier (re-running cells out of order would otherwise break).
fitted_preprocessor = best_pipeline.named_steps["preprocessor"]
best_model = best_pipeline.named_steps["model"]
feature_names = fitted_preprocessor.get_feature_names_out()

if hasattr(best_model, "feature_importances_"):
    # Tree ensembles expose impurity-based importances.
    interpretation_df = pd.DataFrame(
        {"feature": feature_names, "importance": best_model.feature_importances_}
    ).sort_values("importance", ascending=False)
elif hasattr(best_model, "coef_"):
    # Linear models: rank coefficients by absolute magnitude.
    interpretation_df = pd.DataFrame(
        {"feature": feature_names, "coefficient": best_model.coef_}
    ).sort_values("coefficient", key=np.abs, ascending=False)
else:
    # Model exposes neither importances nor coefficients.
    interpretation_df = pd.DataFrame()

# Only the cell's last expression is displayed — the original mid-branch
# `interpretation_df.head(10)` calls were silent no-ops and are removed.
interpretation_df
In [38]:
def plot_top10(data, value_col, title, xlabel):
    """Horizontal bar chart of the 10 largest rows of `data` by `value_col`.

    Rows are re-sorted ascending so the biggest bar appears on top.
    """
    top10 = data.head(10).sort_values(value_col)
    plt.figure(figsize=(9, 5))
    plt.barh(top10["feature"], top10[value_col])
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Feature")
    plt.tight_layout()
    plt.show()


# The two original branches were copy-pasted except for the column name and
# labels — one parameterized helper replaces both.
if "importance" in interpretation_df.columns:
    plot_top10(
        interpretation_df,
        "importance",
        f"Top 10 Most Important Features ({best_model_name})",
        "Importance",
    )
elif "coefficient" in interpretation_df.columns:
    plot_top10(
        interpretation_df,
        "coefficient",
        f"Top 10 Coefficients by Magnitude ({best_model_name})",
        "Coefficient",
    )
8. Conclusion¶
The models in this notebook did not predict annual sales well. Even the best model had an R² value below 0, which suggests that price, review score, review count, and category are weak predictors of annual sales in this dataset. This is still an important result because it shows that better sales prediction likely requires more useful features, such as advertising, discounts, seasonality, or brand strength.