Skip to content

Commit 0acd856

Browse files
authored
Merge pull request #7 from jkortus/fix-2024
Fix for 2024 web pages and package versions
2 parents 2c47b86 + 51c473d commit 0acd856

File tree

5 files changed

+338
-222
lines changed

5 files changed

+338
-222
lines changed

cwm_downloader/scraper/_scraper.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ class Scraper(ABC):
1818
for Lecture and Course.
1919
"""
2020

21-
base_url = "https://codewithmosh.com"
21+
base_url = "https://members.codewithmosh.com"
2222
# ElementSelectors is an enum containing all the css selectors needed
2323
# To scrape the site.
2424
element_selectors = ElementSelectors

cwm_downloader/scraper/course_scraper.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ def get_all_sections(self):
122122
# get a list of all section_names or lecture_anchor_tags which is not wanted. So by selecting the container
123123
# we can pass the section_container as the source to Scraper.select_element.
124124
section_lectures = {
125-
f"{index + 1}- {self.select_element(self.element_selectors.section_names, section_container, single=True).get_text(strip=True)}": [
125+
f"{index + 1}-{self.select_element(self.element_selectors.section_names, section_container, single=True).get_text(strip=True)}": [
126126
# Since we get the relative url of the lectures when using the href attribute of the anchor tags,
127127
# we can use urljoin which smartly joins the base url with the relative url.
128128
Lecture(urljoin(self.base_url, lecture.get('href')), self.session, self.timeout) for lecture in self.select_element(self.element_selectors.lecture_anchor_tags, section_container)

cwm_downloader/scraper/lecture_scraper.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ class Lecture(Scraper):
2020
with a download class).
2121
"""
2222

23-
def get_download_names_and_urls(self) -> Dict[str, str] | None:
23+
def get_download_names_and_urls(self) -> Dict[str|None, str] | None:
2424
""" Get all downloadable urls with their filenames as a dictionary. """
2525
# Select all elements with the download_tags element selector
2626
# Recognize that raise_if_not_found is False that makes it so even if
@@ -29,7 +29,16 @@ def get_download_names_and_urls(self) -> Dict[str, str] | None:
2929
download_tags = self.select_element(self.element_selectors.download_tags, raise_if_not_found=False)
3030
if download_tags is None:
3131
return None
32-
return {tag.get('data-x-origin-download-name'): tag.get('href') for tag in download_tags}
32+
result = {}
33+
for tag in download_tags:
34+
orig_name = tag.get("data-x-origin-download-name")
35+
url = tag.get("href")
36+
if "://" not in url:
37+
# The URL is relative, so join it with the base URL
38+
url = self.base_url + url
39+
result[orig_name] = url
40+
# Note: all links lacking data-x-origin-download-name collapse into the single result[None] entry
41+
return result
3342

3443
def decompose_elements(self, element_selectors: Iterable[str], source: Tag) -> None:
3544
"""
@@ -85,7 +94,15 @@ def get_resource_name(self, bare_resource_name: str) -> str:
8594
resource_name = '-'.join(bare_resource_name.split('-')[1:]).strip()
8695
else:
8796
resource_name = bare_resource_name.strip()
88-
return f"{lecture_number}- resource_{resource_name}"
97+
# if we have no extension, assume it's a video in mp4 format
98+
if "." not in resource_name:
99+
resource_name += ".mp4"
100+
try:
101+
# Try to format the lecture number to a 2 digit number
102+
lecture_number = f"{int(str(lecture_number)):02d}"
103+
except ValueError:
104+
pass # not a number; keep the string as it is
105+
return f"{lecture_number}-resource_{resource_name}"
89106

90107
@staticmethod
91108
def should_overwrite(file_path: Path, noconfirm=False):
@@ -126,7 +143,14 @@ def download(self, base_dir: Path, chunk_size: int = 4096, noconfirm=False):
126143
self.__download_text(file_path, progress_bar, current_task_id)
127144
if download_names_urls is not None:
128145
for download_name, download_url in download_names_urls.items():
129-
filename = sterialize_file_or_folder(download_name)
146+
if download_name is None:
147+
# get the web page title and use it as the filename
148+
download_name = self.get_name()
149+
filename = sterialize_file_or_folder(
150+
self.get_resource_name(download_name)
151+
)
152+
else:
153+
filename = sterialize_file_or_folder(download_name)
130154
if filename.split('.')[-1] != 'mp4':
131155
# This means that the downloadable thing is a resource so
132156
# we use the self.get_resource_name to get the resource name

cwm_downloader/utils.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,9 @@ def get_credentials() -> credentials_type:
107107
# and raise an error if they are not.
108108
if 'headers' not in credentials_dict or 'cookies' not in credentials_dict:
109109
raise InvalidCredentialsError('The contents in credentials.json are invalid.')
110+
# remove Accept-Encoding from the headers to remove compression
111+
if "Accept-Encoding" in credentials_dict["headers"]:
112+
del credentials_dict["headers"]["Accept-Encoding"]
110113
return credentials_dict
111114

112115

0 commit comments

Comments
 (0)