Jelajahi Sumber

[fix] Ensure scheme is properly set in URLs

jherve 1 tahun lalu
induk
komit
d8da756304
2 file diubah dengan 14 tambahan dan 11 penghapusan
  1. 0 4
      README.md
  2. 14 7
      src/de_quoi_parle_le_monde/article.py

+ 0 - 4
README.md

@@ -35,7 +35,3 @@ WHERE
     sv.id != sv2.id
     and sv.timestamp = sv2.timestamp
 ```
-
-### Web archive URL
-
-In "Le Parisien" article snapshots, the archive url is not good, e.g. : //web.archive.org/web/20240522165852/https://www.leparisien.fr/jo-paris-2024/jo-2024-jusqua-1-900-euros-pour-les-cheminots-22-05-2024-SPE5SAOSZVAL3LE3KCMHP2ZBDE.php

+ 14 - 7
src/de_quoi_parle_le_monde/article.py

@@ -13,18 +13,23 @@ cattrs.register_structure_hook(URL, lambda v, _: URL(v))
 
 def url_is_absolute(instance, attribute, value: URL):
     if not value.is_absolute():
-        raise ValueError("URL of articles must be absolute")
+        raise ValueError(f"Expected absolute URL, got {value}")
+
+
+def url_has_scheme(instance, attribute, value: URL):
+    if len(value.scheme) == 0:
+        raise ValueError(f"Expected a scheme in URL, got {value}")
 
 
 @frozen
 class FeaturedArticle:
-    url: URL = field(validator=[url_is_absolute])
+    url: URL = field(validator=[url_is_absolute, url_has_scheme])
 
 
 @frozen
 class FeaturedArticleSnapshot(ABC):
     title: str = field(validator=validators.min_len(1))
-    url: URL = field(validator=[url_is_absolute])
+    url: URL = field(validator=[url_is_absolute, url_has_scheme])
     original: FeaturedArticle
 
     @classmethod
@@ -48,12 +53,14 @@ class FeaturedArticleSnapshot(ABC):
     @staticmethod
     def clean_web_archive_url(url_str: str):
         parsed = URL(url_str)
-        if parsed.is_absolute():
-            return parsed
-        else:
+
+        if not parsed.is_absolute():
             base = URL("https://web.archive.org")
             return base.join(parsed)
-
+        elif len(parsed.scheme) == 0:
+            return parsed.with_scheme("https")
+        else:
+            return parsed
 
 @frozen
 class TopArticle(ABC):