Commit 6c9e810: Create file.py

1 file changed: file.py (+286 -0 lines)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
import optuna
import shap

plt.style.use("ggplot")

import warnings
warnings.filterwarnings("ignore")

# Load the dataset, drop the movie title column, and drop rows with
# missing values
df = pd.read_csv('/kaggle/input/imdb-india-movies/IMDb Movies India.csv',
                 encoding='ISO-8859-1').drop(columns='Name')
df.dropna(inplace=True)
df.head()

# Basic sanity checks: shape, missing values (all zero here, since NaN
# rows were just dropped), duplicates, dtypes, and cardinality
print(df.shape)
print("-" * 60)
print(df.isna().sum())
print("-" * 60)
print(df.duplicated().sum())
print("-" * 60)
df.info()

df.nunique()

# Helper: donut-style pie chart of the 10 most frequent values in a column
def plot_pie_chart(column):
    plt.figure(figsize=(8, 8))
    df[column].value_counts().head(10).plot.pie(
        autopct='%1.1f%%', startangle=90,
        colors=sns.color_palette('pastel'), wedgeprops=dict(width=0.3))
    plt.title(f'Distribution of {column}')
    plt.ylabel('')
    plt.show()


# Helper: histogram of a numeric column
def plot_hist_chart(column):
    plt.figure(figsize=(10, 5))
    df[column].hist()
    plt.title(f'{column} Distribution')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()


# One chart per column: pie for categoricals, histogram for numerics
for col in df.columns:
    if df[col].dtype == 'object':
        plot_pie_chart(col)
    else:
        plot_hist_chart(col)

# Target-encode Director: average rating of that director's films
# (computed over the full dataset before the split, so it leaks the
# target into the features; optimistic for evaluation)
director_avg_rating = {}
for index, row in df.iterrows():
    director = row['Director']
    rating = row['Rating']
    if director in director_avg_rating:
        director_avg_rating[director]['sum'] += rating
        director_avg_rating[director]['count'] += 1
    else:
        director_avg_rating[director] = {'sum': rating, 'count': 1}

df['Director Average Rating'] = df['Director'].apply(
    lambda x: director_avg_rating[x]['sum'] / director_avg_rating[x]['count'])
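
# The loop above is equivalent to a single vectorized groupby (a sketch
# of the pandas idiom, not part of the original notebook):
assert np.allclose(df['Director Average Rating'],
                   df.groupby('Director')['Rating'].transform('mean'))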

# Add a 'Lead Actor Average Rating' column: average rating of the films
# each lead actor appears in (same leakage caveat as above)
actor_avg_rating = {}
for index, row in df.iterrows():
    actors = row['Actor 1'].split(', ')
    rating = row['Rating']
    for actor in actors:
        if actor in actor_avg_rating:
            actor_avg_rating[actor]['sum'] += rating
            actor_avg_rating[actor]['count'] += 1
        else:
            actor_avg_rating[actor] = {'sum': rating, 'count': 1}


def calculate_lead_actor_average(row):
    # If 'Actor 1' lists several names, keep the highest average
    actors = row['Actor 1'].split(', ')
    lead_actor_ratings = [actor_avg_rating[actor]['sum'] / actor_avg_rating[actor]['count']
                          for actor in actors]
    return max(lead_actor_ratings)


df['Lead Actor Average Rating'] = df.apply(calculate_lead_actor_average, axis=1)

# Split the comma-separated Genre field into up to three genre columns
genres = df.Genre.str.split(',', expand=True)
df['Genre1'] = genres[0]
df['Genre2'] = genres[1]
df['Genre3'] = genres[2]

df = df.drop(columns=['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'])

df.shape[0]

print(df.shape)
print(df.isna().sum())
df = df.fillna(0)

# Backfill missing genres so every row carries three genre values
for i in df.index:
    if df.at[i, 'Genre2'] == 0:
        df.at[i, 'Genre2'] = df.at[i, 'Genre1']
    if df.at[i, 'Genre3'] == 0:
        df.at[i, 'Genre3'] = df.at[i, 'Genre2']
print(df.isna().sum())
df.info()

# Integer-encode the genre columns
for col in ['Genre1', 'Genre2', 'Genre3']:
    df[col], _ = pd.factorize(df[col])

df.head()
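
# Note: in the Kaggle copy of this dataset, 'Year' and 'Votes' arrive as
# strings such as '(2019)' and '1,086'; if that holds here, the scaler
# below would fail on them. A minimal coercion sketch, assuming those
# formats (it is a no-op on already-numeric columns):
df['Year'] = pd.to_numeric(df['Year'].astype(str).str.extract(r'(\d{4})')[0])
df['Votes'] = pd.to_numeric(df['Votes'].astype(str).str.replace(',', ''))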

# Feature/target split; Duration is dropped along with the target
target = 'Rating'
X = df.drop(columns=[target, 'Duration'])
y = df[target]
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
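
# Caveat: the scaler is fit on all of X before the split, so test-set
# statistics leak into training. A leak-free ordering would fit on the
# training portion only, e.g.:
#   X_tr, X_te, y_tr, y_te = train_test_split(df.drop(columns=[target, 'Duration']), y, test_size=0.2)
#   scaler = MinMaxScaler().fit(X_tr)
#   X_tr, X_te = scaler.transform(X_tr), scaler.transform(X_te)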

# Optuna objective: train an XGBoost regressor with sampled
# hyperparameters and return its R^2 on the held-out test set
def objective(trial):
    # Define the hyperparameter search space
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'verbosity': 0,
    }

    xgb = XGBRegressor(**param)

    # Fit the model on the training data
    xgb.fit(X_train, y_train)

    # Predict on the held-out set and score
    y_pred = xgb.predict(X_test)
    r2 = r2_score(y_test, y_pred)

    return r2


# Perform hyperparameter optimization using Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
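
# Note: the objective scores every trial on X_test, so the test set also
# serves as the tuning set and the final MAE below will be optimistic; a
# separate validation split (or cross-validation, e.g.
# sklearn.model_selection.cross_val_score) would keep it honest.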

# Print the best trial and parameters found
print("Best trial:")
best_trial = study.best_trial
print(f"  Value: {best_trial.value}")
print("  Params: ")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

# Use the best parameters to train the final model
best_params = best_trial.params

# Train the final model with the best parameters, recording train/test
# error per boosting round via eval_set
xgb_normal = XGBRegressor(**best_params)
xgb_normal.fit(X_train, y_train,
               eval_set=[(X_train, y_train), (X_test, y_test)],
               verbose=0)

# Make predictions on the test set
y_pred_test = xgb_normal.predict(X_test)

mae = mean_absolute_error(y_test, y_pred_test)
print("Test MAE:", mae)

# Retrieve the per-round RMSE recorded during training and locate the
# round with the lowest test RMSE
results = xgb_normal.evals_result()
val_rmse = results['validation_1']['rmse']
best_val_rmse = min(val_rmse)
i_best_epoch = val_rmse.index(best_val_rmse)
epochs = len(results['validation_0']['rmse'])
x_axis = range(0, epochs)

# Plot train and test RMSE per boosting round, marking the best round
fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['rmse'], label='Train')
ax.plot(x_axis, results['validation_1']['rmse'], label='Test')
ax.plot(i_best_epoch, best_val_rmse, marker="o", color="green", label="Best")
ax.legend()
plt.ylabel('RMSE')
plt.title('XGBoost RMSE')
plt.show()
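
# Since an eval_set is already supplied, XGBoost could stop at the best
# round automatically; a sketch (early_stopping_rounds is a constructor
# argument in recent xgboost versions, a fit() argument in older ones):
#   xgb_es = XGBRegressor(**best_params, early_stopping_rounds=50)
#   xgb_es.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=0)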

# Actual vs. predicted ratings; the dashed line marks perfect predictions
plt.scatter(y_test, y_pred_test, alpha=0.7, label='Predictions')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], '--', c='.3')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Scatter Plot XGBoost Model')
plt.show()

# Residuals vs. actual values; points should scatter evenly around zero
residuals = y_test - y_pred_test
plt.scatter(y_test, residuals, label='Residuals', alpha=0.7)
plt.axhline(y=0, color='red', linestyle='--', label='Zero Residuals')
plt.xlabel('Actual Values')
plt.ylabel('Residuals')
plt.title('Residual Plot for XGBoost')
plt.legend()
plt.show()

# Identify the features with the greatest impact on the predicted rating
def plot_feature_importance(model, feature_names=None, plot=True):
    feature_importance = model.feature_importances_

    if feature_names is None:
        # Fall back to the names stored on the fitted booster
        feature_names = model.get_booster().feature_names

    feature_importance_df = pd.DataFrame({'Feature': feature_names,
                                          'Importance': feature_importance})
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    if plot:
        plt.figure(figsize=(10, 10))
        sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
        plt.title('Feature Importance')
        plt.show()

    return feature_importance_df


feature_importance_df = plot_feature_importance(
    xgb_normal, feature_names=df.drop(columns=[target, 'Duration']).columns)
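
# shap is imported above but never used; a minimal sketch of a SHAP
# summary for the tuned model (TreeExplainer supports XGBoost, and the
# feature-name handling mirrors the call above):
explainer = shap.TreeExplainer(xgb_normal)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test,
                  feature_names=df.drop(columns=[target, 'Duration']).columns)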
