import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import shap

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor

plt.style.use("ggplot")
warnings.filterwarnings("ignore")

df = pd.read_csv('/kaggle/input/imdb-india-movies/IMDb Movies India.csv', encoding='ISO-8859-1').drop(columns='Name')
df.dropna(inplace=True)

# The raw file stores Year, Duration and Votes as strings (e.g. "(2019)",
# "109 min", "1,086" -- an assumption about this dataset's formatting), so
# strip the decoration and coerce them to numbers before modelling.
for col in ['Year', 'Duration', 'Votes']:
    df[col] = pd.to_numeric(df[col].astype(str).str.replace(r'[^0-9.]', '', regex=True), errors='coerce')
df.dropna(inplace=True)
df.head()

print(df.shape)
print("-" * 60)
print(df.isna().sum())
print("-" * 60)
print(df.duplicated().sum())
print("-" * 60)
df.info()

df.nunique()

# Donut chart of the 10 most frequent values in a categorical column
def plot_pie_chart(column):
    plt.figure(figsize=(8, 8))
    df[column].value_counts().head(10).plot.pie(
        autopct='%1.1f%%', startangle=90,
        colors=sns.color_palette('pastel'), wedgeprops=dict(width=0.3))
    plt.title(f'Distribution of {column}')
    plt.ylabel('')
    plt.show()


# Histogram of a numeric column
def plot_hist_chart(column):
    plt.figure(figsize=(10, 5))
    df[column].hist()
    plt.title(f'{column} Distribution')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()


# Plot every column: pie charts for categoricals, histograms for numerics
for col in df.columns:
    if df[col].dtype == 'object':
        plot_pie_chart(col)
    else:
        plot_hist_chart(col)

# Average each director's rating in a single pass over the rows
director_avg_rating = {}
for index, row in df.iterrows():
    director = row['Director']
    rating = row['Rating']
    if director in director_avg_rating:
        director_avg_rating[director]['sum'] += rating
        director_avg_rating[director]['count'] += 1
    else:
        director_avg_rating[director] = {'sum': rating, 'count': 1}

df['Director Average Rating'] = df['Director'].apply(lambda x: director_avg_rating[x]['sum'] / director_avg_rating[x]['count'])
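
# Note: the loop above computes a per-director mean of Rating; the same
# feature in one line with pandas (recomputes identical values):
df['Director Average Rating'] = df.groupby('Director')['Rating'].transform('mean')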

# Add a 'Lead Actor Average Rating' column
actor_avg_rating = {}
for index, row in df.iterrows():
    actors = row['Actor 1'].split(', ')
    rating = row['Rating']
    for actor in actors:
        if actor in actor_avg_rating:
            actor_avg_rating[actor]['sum'] += rating
            actor_avg_rating[actor]['count'] += 1
        else:
            actor_avg_rating[actor] = {'sum': rating, 'count': 1}

def calculate_lead_actor_average(row):
    actors = row['Actor 1'].split(', ')
    lead_actor_ratings = [actor_avg_rating[actor]['sum'] / actor_avg_rating[actor]['count'] for actor in actors]
    return max(lead_actor_ratings)

df['Lead Actor Average Rating'] = df.apply(calculate_lead_actor_average, axis=1)
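
# Caveat: both engineered averages include each movie's own rating, which
# leaks the target into the features and flatters the test score. A minimal
# leave-one-out sketch for the director feature (director_loo is a
# hypothetical name; single-film directors fall back to the global mean):
g = df.groupby('Director')['Rating']
director_loo = ((g.transform('sum') - df['Rating']) / (g.transform('count') - 1)).fillna(df['Rating'].mean())
# Swap director_loo in for 'Director Average Rating' for an honest estimate.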

# Split the comma-separated Genre string once, into up to three columns
genres = df.Genre.str.split(',', expand=True)
df['Genre1'] = genres[0]
df['Genre2'] = genres[1]
df['Genre3'] = genres[2]

df = df.drop(columns=['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'])

df.shape[0]

print(df.shape)
print(df.isna().sum())
df = df.fillna(0)

# Back-fill missing genres so every movie has three: a movie with fewer
# genres repeats its last one (two independent ifs, so single-genre rows
# fill Genre2 and Genre3 in the same pass)
for i in df.index:
    if df.at[i, 'Genre2'] == 0:
        df.at[i, 'Genre2'] = df.at[i, 'Genre1']
    if df.at[i, 'Genre3'] == 0:
        df.at[i, 'Genre3'] = df.at[i, 'Genre2']
print(df.isna().sum())
df.info()

# Integer-encode the three genre columns
for col in ['Genre1', 'Genre2', 'Genre3']:
    df[col], _ = pd.factorize(df[col])

df.head()
target = 'Rating'
X = df.drop(columns=[target, 'Duration'])
y = df[target]
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
# Fixed seed so the split (and the scores below) are reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
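
# Caveat: the scaler above is fit on all of X before the split, so test-set
# minima and maxima leak into training. A minimal leakage-free sketch of the
# same step (commented out so the original pipeline is unchanged):
# X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=[target, 'Duration']), y, test_size=0.2, random_state=42)
# scaler = MinMaxScaler().fit(X_train)
# X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)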

# Define objective function for Optuna
def objective(trial):
    # Hyperparameter search space
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'verbosity': 0,
    }

    xgb = XGBRegressor(**param)

    # Fit the model on training data
    xgb.fit(X_train, y_train)

    # Predict on the held-out test set
    y_pred = xgb.predict(X_test)

    return r2_score(y_test, y_pred)

# Perform hyperparameter optimization using Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Print the best trial and parameters found
print("Best trial:")
best_trial = study.best_trial
print(f"  Value: {best_trial.value}")
print("  Params: ")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

# Use the best parameters to train the final model
best_params = best_trial.params
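
# Caveat: the objective above is scored on the same X_test used for the
# final evaluation, so the tuned R^2 is optimistic. A minimal sketch of a
# cross-validated objective that never touches the test split (the name
# objective_cv is hypothetical; 5 folds assumed):
from sklearn.model_selection import cross_val_score

def objective_cv(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'verbosity': 0,
    }
    # Mean R^2 across folds of the training data only
    return cross_val_score(XGBRegressor(**param), X_train, y_train, cv=5, scoring='r2').mean()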

xgb_normal = XGBRegressor(**best_params)

xgb_normal.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=0)

# Make predictions on the test set
y_pred_test = xgb_normal.predict(X_test)

mae = mean_absolute_error(y_test, y_pred_test)

print("Test MAE:", mae)

results = xgb_normal.evals_result()
val_rmse = results['validation_1']['rmse']
best_rmse = min(val_rmse)
i_best_epoch = val_rmse.index(best_rmse)
epochs = len(results['validation_0']['rmse'])
x_axis = range(epochs)

# Plot train/test RMSE per boosting round, marking the best test round
fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['rmse'], label='Train')
ax.plot(x_axis, results['validation_1']['rmse'], label='Test')
ax.plot(i_best_epoch, best_rmse, marker="o", color="green", label="Best")
ax.legend()
plt.ylabel('RMSE')
plt.title('XGBoost RMSE')
plt.show()
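
# The curve above shows where validation RMSE bottoms out, but the model
# still keeps every tree. A sketch of early stopping (commented out; recent
# xgboost takes early_stopping_rounds in the constructor, older releases
# accept it in fit()):
# xgb_es = XGBRegressor(**best_params, early_stopping_rounds=50)
# xgb_es.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=0)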

# Predicted vs. actual ratings; the dashed line marks perfect predictions
plt.scatter(y_test, y_pred_test, alpha=0.7)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], '--', c='.3')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Scatter Plot XGBoost Model')
plt.show()

residuals = y_test - y_pred_test
plt.scatter(y_test, residuals, label='Residuals', alpha=0.7)
plt.axhline(y=0, color='red', linestyle='--', label='Zero Residuals')
plt.xlabel('Actual Values')
plt.ylabel('Residuals')
plt.title('Residual Plot for XGBoost')
plt.legend()
plt.show()

# Rank the features that most influence the predicted rating
def plot_feature_importance(model, feature_names=None, plot=True):
    feature_importance = model.feature_importances_

    if feature_names is None:
        # The model was fit on a bare numpy array and carries no column
        # names of its own, so fall back to generic ones
        feature_names = [f'f{i}' for i in range(len(feature_importance))]

    feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    if plot:
        plt.figure(figsize=(10, 10))
        sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
        plt.title('Feature Importance')
        plt.show()

    return feature_importance_df

feature_importance_df = plot_feature_importance(xgb_normal, feature_names=df.drop(columns=[target, 'Duration']).columns)
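
# shap is imported above but never used; a minimal sketch of SHAP values for
# the tuned model (TreeExplainer supports XGBoost trees natively)
explainer = shap.TreeExplainer(xgb_normal)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, feature_names=df.drop(columns=[target, 'Duration']).columns)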