import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import shap

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor

plt.style.use("ggplot")
warnings.filterwarnings("ignore")

df = pd.read_csv('/kaggle/input/imdb-india-movies/IMDb Movies India.csv', encoding='ISO-8859-1').drop(columns='Name')
df.dropna(inplace=True)

# The raw file stores Year, Duration and Votes as strings (e.g. "(2019)",
# "109 min", "1,086" -- an assumption about this dataset's formatting), so
# strip the decoration and coerce them to numbers before modelling.
for col in ['Year', 'Duration', 'Votes']:
    df[col] = pd.to_numeric(df[col].astype(str).str.replace(r'[^0-9.]', '', regex=True), errors='coerce')
df.dropna(inplace=True)
df.head()

print(df.shape)
print("-" * 60)
print(df.isna().sum())
print("-" * 60)
print(df.duplicated().sum())
print("-" * 60)
df.info()

df.nunique()

# Donut chart of the 10 most frequent values in a categorical column
def plot_pie_chart(column):
    plt.figure(figsize=(8, 8))
    df[column].value_counts().head(10).plot.pie(
        autopct='%1.1f%%', startangle=90,
        colors=sns.color_palette('pastel'), wedgeprops=dict(width=0.3))
    plt.title(f'Distribution of {column}')
    plt.ylabel('')
    plt.show()


# Histogram of a numeric column
def plot_hist_chart(column):
    plt.figure(figsize=(10, 5))
    df[column].hist()
    plt.title(f'{column} Distribution')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()


# Plot every column: pie charts for categoricals, histograms for numerics
for col in df.columns:
    if df[col].dtype == 'object':
        plot_pie_chart(col)
    else:
        plot_hist_chart(col)

# Average each director's rating in a single pass over the rows
director_avg_rating = {}
for index, row in df.iterrows():
    director = row['Director']
    rating = row['Rating']
    if director in director_avg_rating:
        director_avg_rating[director]['sum'] += rating
        director_avg_rating[director]['count'] += 1
    else:
        director_avg_rating[director] = {'sum': rating, 'count': 1}

df['Director Average Rating'] = df['Director'].apply(lambda x: director_avg_rating[x]['sum'] / director_avg_rating[x]['count'])
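
# Note: the loop above computes a per-director mean of Rating; the same
# feature in one line with pandas (recomputes identical values):
df['Director Average Rating'] = df.groupby('Director')['Rating'].transform('mean')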

# Add a 'Lead Actor Average Rating' column
actor_avg_rating = {}
for index, row in df.iterrows():
    actors = row['Actor 1'].split(', ')
    rating = row['Rating']
    for actor in actors:
        if actor in actor_avg_rating:
            actor_avg_rating[actor]['sum'] += rating
            actor_avg_rating[actor]['count'] += 1
        else:
            actor_avg_rating[actor] = {'sum': rating, 'count': 1}

def calculate_lead_actor_average(row):
    actors = row['Actor 1'].split(', ')
    lead_actor_ratings = [actor_avg_rating[actor]['sum'] / actor_avg_rating[actor]['count'] for actor in actors]
    return max(lead_actor_ratings)

df['Lead Actor Average Rating'] = df.apply(calculate_lead_actor_average, axis=1)
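
# Caveat: both engineered averages include each movie's own rating, which
# leaks the target into the features and flatters the test score. A minimal
# leave-one-out sketch for the director feature (director_loo is a
# hypothetical name; single-film directors fall back to the global mean):
g = df.groupby('Director')['Rating']
director_loo = ((g.transform('sum') - df['Rating']) / (g.transform('count') - 1)).fillna(df['Rating'].mean())
# Swap director_loo in for 'Director Average Rating' for an honest estimate.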

# Split the comma-separated Genre string once, into up to three columns
genres = df.Genre.str.split(',', expand=True)
df['Genre1'] = genres[0]
df['Genre2'] = genres[1]
df['Genre3'] = genres[2]

df = df.drop(columns=['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'])

df.shape[0]

print(df.shape)
print(df.isna().sum())
df = df.fillna(0)

# Back-fill missing genres so every movie has three: a movie with fewer
# genres repeats its last one (two independent ifs, so single-genre rows
# fill Genre2 and Genre3 in the same pass)
for i in df.index:
    if df.at[i, 'Genre2'] == 0:
        df.at[i, 'Genre2'] = df.at[i, 'Genre1']
    if df.at[i, 'Genre3'] == 0:
        df.at[i, 'Genre3'] = df.at[i, 'Genre2']
print(df.isna().sum())
df.info()

# Integer-encode the three genre columns
for col in ['Genre1', 'Genre2', 'Genre3']:
    df[col], _ = pd.factorize(df[col])

df.head()
target = 'Rating'
X = df.drop(columns=[target, 'Duration'])
y = df[target]
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
# Fixed seed so the split (and the scores below) are reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
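
# Caveat: the scaler above is fit on all of X before the split, so test-set
# minima and maxima leak into training. A minimal leakage-free sketch of the
# same step (commented out so the original pipeline is unchanged):
# X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=[target, 'Duration']), y, test_size=0.2, random_state=42)
# scaler = MinMaxScaler().fit(X_train)
# X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)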

# Define objective function for Optuna
def objective(trial):
    # Hyperparameter search space
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'verbosity': 0,
    }

    xgb = XGBRegressor(**param)

    # Fit the model on training data
    xgb.fit(X_train, y_train)

    # Predict on the held-out test set
    y_pred = xgb.predict(X_test)

    return r2_score(y_test, y_pred)

# Perform hyperparameter optimization using Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Print the best trial and parameters found
print("Best trial:")
best_trial = study.best_trial
print(f"  Value: {best_trial.value}")
print("  Params: ")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

# Use the best parameters to train the final model
best_params = best_trial.params
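
# Caveat: the objective above is scored on the same X_test used for the
# final evaluation, so the tuned R^2 is optimistic. A minimal sketch of a
# cross-validated objective that never touches the test split (the name
# objective_cv is hypothetical; 5 folds assumed):
from sklearn.model_selection import cross_val_score

def objective_cv(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'verbosity': 0,
    }
    # Mean R^2 across folds of the training data only
    return cross_val_score(XGBRegressor(**param), X_train, y_train, cv=5, scoring='r2').mean()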

xgb_normal = XGBRegressor(**best_params)

xgb_normal.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=0)

# Make predictions on the test set
y_pred_test = xgb_normal.predict(X_test)

mae = mean_absolute_error(y_test, y_pred_test)

print("Test MAE:", mae)

results = xgb_normal.evals_result()
val_rmse = results['validation_1']['rmse']
best_rmse = min(val_rmse)
i_best_epoch = val_rmse.index(best_rmse)
epochs = len(results['validation_0']['rmse'])
x_axis = range(epochs)

# Plot train/test RMSE per boosting round, marking the best test round
fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['rmse'], label='Train')
ax.plot(x_axis, results['validation_1']['rmse'], label='Test')
ax.plot(i_best_epoch, best_rmse, marker="o", color="green", label="Best")
ax.legend()
plt.ylabel('RMSE')
plt.title('XGBoost RMSE')
plt.show()
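
# The curve above shows where validation RMSE bottoms out, but the model
# still keeps every tree. A sketch of early stopping (commented out; recent
# xgboost takes early_stopping_rounds in the constructor, older releases
# accept it in fit()):
# xgb_es = XGBRegressor(**best_params, early_stopping_rounds=50)
# xgb_es.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=0)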

# Predicted vs. actual ratings; the dashed line marks perfect predictions
plt.scatter(y_test, y_pred_test, alpha=0.7)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], '--', c='.3')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Scatter Plot XGBoost Model')
plt.show()

residuals = y_test - y_pred_test
plt.scatter(y_test, residuals, label='Residuals', alpha=0.7)
plt.axhline(y=0, color='red', linestyle='--', label='Zero Residuals')
plt.xlabel('Actual Values')
plt.ylabel('Residuals')
plt.title('Residual Plot for XGBoost')
plt.legend()
plt.show()

# Rank the features that most influence the predicted rating
def plot_feature_importance(model, feature_names=None, plot=True):
    feature_importance = model.feature_importances_

    if feature_names is None:
        # The model was fit on a bare numpy array and carries no column
        # names of its own, so fall back to generic ones
        feature_names = [f'f{i}' for i in range(len(feature_importance))]

    feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    if plot:
        plt.figure(figsize=(10, 10))
        sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
        plt.title('Feature Importance')
        plt.show()

    return feature_importance_df

feature_importance_df = plot_feature_importance(xgb_normal, feature_names=df.drop(columns=[target, 'Duration']).columns)
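
# shap is imported above but never used; a minimal sketch of SHAP values for
# the tuned model (TreeExplainer supports XGBoost trees natively)
explainer = shap.TreeExplainer(xgb_normal)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, feature_names=df.drop(columns=[target, 'Duration']).columns)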