Bladeren bron

Drastically improve input data validation / normalization

jherve 1 jaar geleden
bovenliggende
commit
c5eb2f2178

+ 41 - 13
src/de_quoi_parle_le_monde/article.py

@@ -1,6 +1,6 @@
 import asyncio
 from abc import ABC, abstractmethod
-from attrs import frozen, field
+from attrs import frozen, field, validators
 import cattrs
 from bs4 import BeautifulSoup
 from yarl import URL
@@ -11,31 +11,48 @@ from de_quoi_parle_le_monde.internet_archive import InternetArchiveSnapshot
 cattrs.register_structure_hook(URL, lambda v, _: URL(v))
 
 
+def url_is_absolute(instance, attribute, value: URL):
+    if not value.is_absolute():
+        raise ValueError("URL of articles must be absolute")
+
+
 @frozen
 class FeaturedArticle:
-    url: URL
-
-    @classmethod
-    def from_internet_archive_url(cls, url_str: str) -> "FeaturedArticle":
-        url = URL(url_str)
-        original_str = url.path.split("/", 3)[-1]
-        return cattrs.structure({"url": original_str}, cls)
+    url: URL = field(validator=[url_is_absolute])
 
 
 @frozen
 class FeaturedArticleSnapshot(ABC):
-    title: str
-    url: str
+    title: str = field(validator=validators.min_len(1))
+    url: URL = field(validator=[url_is_absolute])
     original: FeaturedArticle
 
     @classmethod
     def create(cls, title, url):
+        absolute = cls.clean_web_archive_url(url)
         attrs = dict(
             title=title,
-            url=url,
-            original=FeaturedArticle.from_internet_archive_url(url),
+            url=absolute,
+            original={"url": cls.extract_url_from_web_archive(absolute)},
         )
-        return cls(**attrs)
+        return cattrs.structure(attrs, cls)
+
+    @staticmethod
+    def extract_url_from_web_archive(url: URL):
+    # This extracts e.g. this URL
+    # https://www.lemonde.fr/economie/article/2024/05/22/totalenergies-cent-bougies-et-un-feu-de-critiques_6234759_3234.html
+    # from a URL that looks like:
+    # http://web.archive.org/web/20240522114811/https://www.lemonde.fr/economie/article/2024/05/22/totalenergies-cent-bougies-et-un-feu-de-critiques_6234759_3234.html
+        return url.path.split("/", 3)[-1]
+
+    @staticmethod
+    def clean_web_archive_url(url_str: str):
+        parsed = URL(url_str)
+        if parsed.is_absolute():
+            return parsed
+        else:
+            base = URL("https://web.archive.org")
+            return base.join(parsed)
 
 
 @frozen
@@ -43,11 +60,22 @@ class TopArticle(ABC):
     article: FeaturedArticleSnapshot
     rank: int
 
+    @classmethod
+    def create(cls, title, url, rank):
+        article = FeaturedArticleSnapshot.create(title, url)
+        attrs = {"article": cattrs.unstructure(article), "rank": rank}
+        return cattrs.structure(attrs, cls)
+
 
 @frozen
 class MainArticle(ABC):
     article: FeaturedArticleSnapshot
 
+    @classmethod
+    def create(cls, title, url):
+        article = FeaturedArticleSnapshot.create(title, url)
+        return cls(article)
+
 
 @frozen
 class MainPage(ABC):

+ 6 - 10
src/de_quoi_parle_le_monde/medias/bfmtv.py

@@ -1,5 +1,4 @@
 from de_quoi_parle_le_monde.article import (
-    FeaturedArticleSnapshot,
     TopArticle,
     MainArticle,
     MainPage,
@@ -12,10 +11,9 @@ class BfmTvMainPage(MainPage):
     def get_top_articles(soup):
         all_articles = soup.select("section[id*='top_contenus'] li > a")
         return [
-            TopArticle(
-                article=FeaturedArticleSnapshot.create(
-                    title=to_text(a, "h3"), url=a["href"]
-                ),
+            TopArticle.create(
+                title=to_text(a, "h3"),
+                url=a["href"],
                 rank=idx + 1,
             )
             for idx, a in enumerate(all_articles)
@@ -28,9 +26,7 @@ class BfmTvMainPage(MainPage):
             return link["href"]
 
         [main] = soup.select("article.une_item")
-        return MainArticle(
-            article=FeaturedArticleSnapshot.create(
-                title=to_text(main, "h2.title_une_item"),
-                url=to_href(main),
-            )
+        return MainArticle.create(
+            title=to_text(main, "h2.title_une_item"),
+            url=to_href(main),
         )

+ 6 - 10
src/de_quoi_parle_le_monde/medias/cnews.py

@@ -1,7 +1,6 @@
 from bs4 import BeautifulSoup
 
 from de_quoi_parle_le_monde.article import (
-    FeaturedArticleSnapshot,
     TopArticle,
     MainArticle,
     MainPage,
@@ -15,10 +14,9 @@ class CNewsMainPage(MainPage):
         all_articles = soup.select(".top-news-content a")
 
         return [
-            TopArticle(
-                article=FeaturedArticleSnapshot.create(
-                    title=to_text(a, "h3.dm-letop-title"), url=a["href"]
-                ),
+            TopArticle.create(
+                title=to_text(a, "h3.dm-letop-title"),
+                url=a["href"],
                 rank=idx + 1,
             )
             for idx, a in enumerate(all_articles)
@@ -29,9 +27,7 @@ class CNewsMainPage(MainPage):
         main = soup.select("div.dm-block")[0]
         [url] = main.select("a")
 
-        return MainArticle(
-            article=FeaturedArticleSnapshot.create(
-                title=to_text(main, "h2.dm-news-title"),
-                url=url["href"],
-            )
+        return MainArticle.create(
+            title=to_text(main, "h2.dm-news-title"),
+            url=url["href"],
         )

+ 6 - 11
src/de_quoi_parle_le_monde/medias/france_tv_info.py

@@ -1,5 +1,4 @@
 from de_quoi_parle_le_monde.article import (
-    FeaturedArticleSnapshot,
     TopArticle,
     MainArticle,
     MainPage,
@@ -17,11 +16,9 @@ class FranceTvInfoMainPage(MainPage):
         all_articles = soup.select("article.card-article-most-read")
 
         return [
-            TopArticle(
-                article=FeaturedArticleSnapshot.create(
-                    title=to_text(a, "p.card-article-most-read__title"),
-                    url=to_href(a, "a"),
-                ),
+            TopArticle.create(
+                title=to_text(a, "p.card-article-most-read__title"),
+                url=to_href(a, "a"),
                 rank=idx + 1,
             )
             for idx, a in enumerate(all_articles)
@@ -42,9 +39,7 @@ class FranceTvInfoMainPage(MainPage):
             main, ".card-article-majeure__title", ".card-article-actu-forte__title"
         )
 
-        return MainArticle(
-            article=FeaturedArticleSnapshot.create(
-                title=title.text.strip(),
-                url=main.find("a")["href"],
-            )
+        return MainArticle.create(
+            title=title.text.strip(),
+            url=main.find("a")["href"],
         )

+ 6 - 10
src/de_quoi_parle_le_monde/medias/le_monde.py

@@ -1,5 +1,4 @@
 from de_quoi_parle_le_monde.article import (
-    FeaturedArticleSnapshot,
     TopArticle,
     MainArticle,
     MainPage,
@@ -12,10 +11,9 @@ class LeMondeMainPage(MainPage):
     def get_top_articles(soup):
         all_articles = soup.select("div.top-article")
         return [
-            TopArticle(
-                article=FeaturedArticleSnapshot.create(
-                    title=a.text.strip(), url=a.find("a")["href"]
-                ),
+            TopArticle.create(
+                title=a.text.strip(),
+                url=a.find("a")["href"],
                 rank=idx + 1,
             )
             for idx, a in enumerate(all_articles)
@@ -28,9 +26,7 @@ class LeMondeMainPage(MainPage):
             return link["href"]
 
         [main] = soup.select("div.article--main")
-        return MainArticle(
-            article=FeaturedArticleSnapshot.create(
-                title=to_text(main, "p.article__title-label"),
-                url=to_href(main),
-            )
+        return MainArticle.create(
+            title=to_text(main, "p.article__title-label"),
+            url=to_href(main),
         )

+ 2 - 2
src/de_quoi_parle_le_monde/storage.py

@@ -575,9 +575,9 @@ class Storage:
                 "featured_article_snapshots",
                 ["title", "url", "featured_article_id"],
             ),
-            [article.title, article.url, featured_article_id],
+            [article.title, str(article.url), featured_article_id],
             "SELECT id FROM featured_article_snapshots WHERE featured_article_id = $1 AND url = $2",
-            [featured_article_id, article.url],
+            [featured_article_id, str(article.url)],
         )
 
     async def _add_main_article(self, conn, snapshot_id: int, article_id: int):