11from bs4 import BeautifulSoup
22import requests
3+ from fake_useragent import UserAgent
34
def requestUrl_and_bs4(url: str, timeout: float = 10):
    """Fetch *url* and return its body parsed as a BeautifulSoup tree.

    Every page request and HTML parse in this module goes through here.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        built-in ``html.parser``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # A random browser-like User-Agent is sent with each request; per the
    # original author's note this changed the response status from 403 to 200.
    headers = {"User-Agent": UserAgent().random}

    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')
15+
16+
def getMovieDetails(movieName: str):
    """Look up a feature film on IMDb and scrape its headline details.

    The first hit of an IMDb title search for *movieName* is followed to its
    title page, and the fields below are read from that page (plus one extra
    request to the release-info page for the release date).

    Args:
        movieName: Free-text movie title typed by the user.

    Returns:
        ``None`` when the search yields no result link. Otherwise a dict with
        the keys ``name``, ``year``, ``genres``, ``rating``, ``runtime``,
        ``release_date``, ``directors``, ``cast``, ``writers`` and ``plot``.
        Fields that cannot be located fall back to ``'Not available'``
        (``'Not yet rated'`` for ``rating``, ``'Not found'`` for ``writers``,
        an empty list for ``genres``/``directors``/``cast``) instead of
        raising.
    """
    # Imported locally so this function does not depend on edits elsewhere.
    from urllib.parse import quote_plus

    url = 'https://www.imdb.com'
    not_available = 'Not available'
    movieDetails = {}

    # quote_plus turns spaces into '+' (as before) and additionally escapes
    # characters such as '&' or '#' that would otherwise corrupt the query.
    website_url = (
        url + '/search/title?title=' + quote_plus(movieName.strip())
        + '&title_type=feature'
    )
    bs = requestUrl_and_bs4(website_url)

    result = bs.find('a', {'class': 'ipc-title-link-wrapper'})
    if result is None:
        return None
    href = result.get('href')
    if not href:
        return None

    movielink = url + href
    bs = requestUrl_and_bs4(movielink)

    # Movie name
    title = bs.find('h1', {'data-testid': 'hero__pageTitle'})
    movieDetails['name'] = title.text if title is not None else not_available

    # Year / runtime live in the same <ul> under the title.
    # NOTE(review): the 'sc-...' class names look build-generated and may
    # change whenever the site is redeployed -- confirm they are still valid.
    meta_box = bs.find('div', {'class': 'sc-b7c53eda-0 dUpRPQ'})
    if meta_box is not None and meta_box.ul is not None:
        box = meta_box.ul.find_all('li')
    else:
        box = []

    movieDetails['year'] = box[0].text if box else not_available

    # Genres
    box_two = bs.find('div', {'data-testid': 'genres'})
    if box_two is not None:
        movieDetails['genres'] = [
            i.text for i in box_two.select('div.ipc-chip-list__scroller>a>span')
        ]
    else:
        movieDetails['genres'] = []

    # Rating, formatted as "<score>/10 (<vote count>)". The lookups are done
    # before formatting so the f-string needs no nested quotes.
    score = bs.find(
        'div', {'data-testid': 'hero-rating-bar__aggregate-rating__score'})
    votes = bs.find('div', {'class': 'sc-bde20123-3 gPVQxL'})
    try:
        movieDetails['rating'] = f"{score.span.text}/10 ({votes.text})"
    except AttributeError:
        movieDetails['rating'] = 'Not yet rated'

    # Runtime is handled independently of the rating so that a missing
    # rating no longer discards a runtime that is present.
    movieDetails['runtime'] = (
        box[2].text.strip() if len(box) > 2 else not_available
    )

    # Release date: the first metadata item links to the release-info page.
    movieDetails['release_date'] = not_available
    release_link = box[0].a if box else None
    release_href = release_link.get('href') if release_link is not None else None
    if release_href:
        soup = requestUrl_and_bs4(url + release_href)
        release = soup.select_one(
            '#rel_1 > div > ul > li > span.ipc-metadata-list-item__list-content-item')
        if release is not None:
            movieDetails['release_date'] = release.text

    # Credits: the first lists on the page are read as directors, writers, cast.
    creditSummary = bs.select('div.ipc-metadata-list-item__content-container > ul')
    credit_names = [
        [i.text for i in ul.select('li>a')] for ul in creditSummary[:3]
    ]

    movieDetails['directors'] = credit_names[0] if credit_names else []
    if len(credit_names) >= 3:
        movieDetails['cast'] = credit_names[2]
        movieDetails['writers'] = credit_names[1]
    elif len(credit_names) == 2:
        # Only two lists: treat the second as the cast, with no writers listed.
        movieDetails['cast'] = credit_names[1]
        movieDetails['writers'] = 'Not found'
    else:
        movieDetails['cast'] = []
        movieDetails['writers'] = 'Not found'

    # Plot summary
    plot = bs.find('span', {'data-testid': 'plot-l'})
    movieDetails['plot'] = plot.text.strip() if plot is not None else not_available

    return movieDetails
6080
61-
62- if __name__ == "__main__" :
81+ def main ():
6382 movieName = input ('Enter the movie name : \n ' )
6483 movieDetails = getMovieDetails (movieName )
6584 if movieDetails is None :
@@ -75,3 +94,6 @@ def getMovieDetails(movieName):
7594 print ('Writer:' , ', ' .join (movieDetails ['writers' ]))
7695 print ('Cast:' , ', ' .join (movieDetails ['cast' ]))
7796 print ('Plot Summary:\n ' , movieDetails ['plot' ])
97+
# Run the interactive lookup only when executed as a script, not on import.
if __name__ == "__main__":
    main()
0 commit comments