44import json
55import random
66import argparse
7+ from tqdm import tqdm
78
89class AmazonScraper :
910 user_agents = [
@@ -52,56 +53,48 @@ def start_scraping(self):
5253 self .product_writer .writerow (["product_name" , "product_images" , "number_of_reviews" , "price" , "product_url" , "asin" ])
5354 if self .review :
5455 self .review_writer .writerow (["product_name" , "product_reviews" , "product_url" , "asin" ])
55- for page in range (1 , self .pages + 1 ):
56+ for page in tqdm ( range (1 , self .pages + 1 ), desc = "Scraping Pages" ):
5657 url = self .url + "&page=" + str (page )
5758 headers = {"User-Agent" : random .choice (self .user_agents )}
5859 response = requests .get (url , headers = headers )
5960 soup = BeautifulSoup (response .content , "html.parser" )
6061 products = soup .find_all ("div" , {"class" : "sg-col-inner" })
6162
6263 for product in products :
63- # Product name
6464 name = product .find ("span" , {"class" : "a-size-medium a-color-base a-text-normal" })
6565 if name is not None :
6666 name = name .text
6767 else :
68- continue # Skip if no product name
68+ continue
6969
70- # Product images
7170 images = product .find_all ("img" , {"class" : "s-image" })
7271 if images is not None :
7372 images = [image ['src' ] for image in images ]
7473 else :
7574 images = []
7675
77- # Number of Reviews
7876 number_of_reviews = product .find ("span" , {"class" : "a-size-base" })
7977 if number_of_reviews is not None :
8078 number_of_reviews = number_of_reviews .text
8179 else :
8280 number_of_reviews = ''
8381
84- # Price
8582 price = product .find ("span" , {"class" : "a-offscreen" })
8683 if price is not None :
8784 price = price .text
8885 else :
8986 price = ''
9087
91- # Product URL
9288 product_url = product .find ("a" , {"class" : "a-link-normal" })
9389 if product_url is not None :
9490 product_url = f'https://www.amazon.{ self .locale } ' + product_url ['href' ]
9591 else :
9692 product_url = ''
9793
98- # ASIN
9994 asin = product_url .split ("/dp/" )[1 ].split ("/" )[0 ] if "/dp/" in product_url else ''
10095
101- # Write to CSV
10296 self .product_writer .writerow ([name , ", " .join (images ), number_of_reviews , price , product_url , asin ])
10397
104- # Add to JSON data
10598 self .product_json_data .append ({
10699 "product_name" : name ,
107100 "product_images" : images ,
@@ -111,7 +104,6 @@ def start_scraping(self):
111104 "asin" : asin
112105 })
113106
114- # Product reviews
115107 if self .review :
116108 product_reviews = []
117109 review_url = f'https://www.amazon.{ self .locale } /product-reviews/{ asin } '
@@ -121,10 +113,8 @@ def start_scraping(self):
121113 for review in reviews :
122114 product_reviews .append (review .text .strip ())
123115
124- # Write to CSV
125116 self .review_writer .writerow ([name , ", " .join (product_reviews ), product_url , asin ])
126117
127- # Add to JSON data
128118 self .review_json_data .append ({
129119 "product_name" : name ,
130120 "product_reviews" : product_reviews ,
0 commit comments