Skip to content

Commit 0acd856

Browse files
authored
Merge pull request #7 from jkortus/fix-2024
Fix for 2024 web pages and package versions
2 parents 2c47b86 + 51c473d commit 0acd856

File tree

5 files changed

+338
-222
lines changed

5 files changed

+338
-222
lines changed

cwm_downloader/scraper/_scraper.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ class Scraper(ABC):
1818
for Lecture and Course.
1919
"""
2020

21-
base_url = "https://codewithmosh.com"
21+
base_url = "https://members.codewithmosh.com"
2222
# ElementSelectors is an enum containing all the css selectors needed
2323
# To scrape the site.
2424
element_selectors = ElementSelectors

cwm_downloader/scraper/course_scraper.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ def get_all_sections(self):
122122
# get a list of all section_names or lecture_anchor_tags which is not wanted. So by selecting the container
123123
# we can pass the section_container as the source to Scraper.select_element.
124124
section_lectures = {
125-
f"{index + 1}- {self.select_element(self.element_selectors.section_names, section_container, single=True).get_text(strip=True)}": [
125+
f"{index + 1}-{self.select_element(self.element_selectors.section_names, section_container, single=True).get_text(strip=True)}": [
126126
# Since we get the relative url of the lectures when using the href attribute of the anchor tags,
127127
# we can use urljoin which smartly joins the base url with the relative url.
128128
Lecture(urljoin(self.base_url, lecture.get('href')), self.session, self.timeout) for lecture in self.select_element(self.element_selectors.lecture_anchor_tags, section_container)

cwm_downloader/scraper/lecture_scraper.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ class Lecture(Scraper):
2020
with a download class).
2121
"""
2222

23-
def get_download_names_and_urls(self) -> Dict[str, str] | None:
23+
def get_download_names_and_urls(self) -> Dict[str|None, str] | None:
2424
""" Get all downloadable urls with their filenames as a dictionary. """
2525
# Select all elements with the download_tags element selector
2626
# Recognize that raise_if_not_found is False that makes it so even if
@@ -29,7 +29,16 @@ def get_download_names_and_urls(self) -> Dict[str, str] | None:
2929
download_tags = self.select_element(self.element_selectors.download_tags, raise_if_not_found=False)
3030
if download_tags is None:
3131
return None
32-
return {tag.get('data-x-origin-download-name'): tag.get('href') for tag in download_tags}
32+
result = {}
33+
for tag in download_tags:
34+
orig_name = tag.get("data-x-origin-download-name")
35+
url = tag.get("href")
36+
if "://" not in url:
37+
# The URL is relative, so join it with the base URL
38+
url = self.base_url + url
39+
result[orig_name] = url
40+
# Note: all links lacking data-x-origin-download-name collapse into the single result[None] entry
41+
return result
3342

3443
def decompose_elements(self, element_selectors: Iterable[str], source: Tag) -> None:
3544
"""
@@ -85,7 +94,15 @@ def get_resource_name(self, bare_resource_name: str) -> str:
8594
resource_name = '-'.join(bare_resource_name.split('-')[1:]).strip()
8695
else:
8796
resource_name = bare_resource_name.strip()
88-
return f"{lecture_number}- resource_{resource_name}"
97+
# if we have no extension, assume it's a video in mp4 format
98+
if "." not in resource_name:
99+
resource_name += ".mp4"
100+
try:
101+
# Try to format the lecture number to a 2 digit number
102+
lecture_number = f"{int(str(lecture_number)):02d}"
103+
except ValueError:
104+
pass # not a number; keep the string as it is
105+
return f"{lecture_number}-resource_{resource_name}"
89106

90107
@staticmethod
91108
def should_overwrite(file_path: Path, noconfirm=False):
@@ -126,7 +143,14 @@ def download(self, base_dir: Path, chunk_size: int = 4096, noconfirm=False):
126143
self.__download_text(file_path, progress_bar, current_task_id)
127144
if download_names_urls is not None:
128145
for download_name, download_url in download_names_urls.items():
129-
filename = sterialize_file_or_folder(download_name)
146+
if download_name is None:
147+
# get the web page title and use it as the filename
148+
download_name = self.get_name()
149+
filename = sterialize_file_or_folder(
150+
self.get_resource_name(download_name)
151+
)
152+
else:
153+
filename = sterialize_file_or_folder(download_name)
130154
if filename.split('.')[-1] != 'mp4':
131155
# This means that the downloadable thing is a resource so
132156
# we use the self.get_resource_name to get the resource name

cwm_downloader/utils.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,9 @@ def get_credentials() -> credentials_type:
107107
# and raise an error if they are not.
108108
if 'headers' not in credentials_dict or 'cookies' not in credentials_dict:
109109
raise InvalidCredentialsError('The contents in credentials.json are invalid.')
110+
# remove Accept-Encoding from the headers to remove compression
111+
if "Accept-Encoding" in credentials_dict["headers"]:
112+
del credentials_dict["headers"]["Accept-Encoding"]
110113
return credentials_dict
111114

112115

0 commit comments

Comments
 (0)