Procházet zdrojové kódy

Parse archive's URL to get the original article URL

jherve před 1 rokem
rodič
revize
9a77b03f00

+ 1 - 1
pdm.lock

@@ -5,7 +5,7 @@
 groups = ["default"]
 strategy = ["cross_platform", "inherit_metadata"]
 lock_version = "4.4.1"
-content_hash = "sha256:9cfacc65dfaf2b246f751386054056e6759a3cd53ddfd55dc3a206704e548cf4"
+content_hash = "sha256:eee4220e2da5545c63cad757839431089495ed121e3dafd9a1ec10dcf4264f27"
 
 [[package]]
 name = "aioboto3"

+ 1 - 0
pyproject.toml

@@ -16,6 +16,7 @@ dependencies = [
     "aiohttp-client-cache[all]>=0.11.0",
     "lxml>=5.1.0",
     "aiolimiter>=1.1.0",
+    "yarl>=1.9.4",
 ]
 requires-python = "==3.11.*"
 readme = "README.md"

+ 14 - 0
src/de_quoi_parle_le_monde/article.py

@@ -1,6 +1,7 @@
 from abc import ABC, abstractmethod
 from attrs import frozen
 from bs4 import BeautifulSoup
+from yarl import URL
 
 from de_quoi_parle_le_monde.internet_archive import InternetArchiveSnapshot
 
@@ -9,6 +10,19 @@ from de_quoi_parle_le_monde.internet_archive import InternetArchiveSnapshot
 class FeaturedArticle(ABC):
     title: str
     url: str
+    original: URL
+
+    @staticmethod
+    def to_original_url(url: str) -> URL:
+        url = URL(url)
+        original_str = url.path.split("/", 3)[-1]
+        original = URL(original_str)
+        assert original.is_absolute(), f"{original}"
+        return original
+
+    @classmethod
+    def create(cls, title, url):
+        return cls(title, url, cls.to_original_url(url))
 
 
 @frozen

+ 2 - 2
src/de_quoi_parle_le_monde/france_tv_info.py

@@ -20,7 +20,7 @@ class FranceTvInfoMainPage(MainPage):
         all_articles = soup.find_all("article", class_="card-article-most-read")
         return [
             TopArticle(
-                article=FranceTvInfoFeaturedArticle(
+                article=FranceTvInfoFeaturedArticle.create(
                     title=a.find(
                         "p", class_="card-article-most-read__title"
                     ).text.strip(),
@@ -41,7 +41,7 @@ class FranceTvInfoMainPage(MainPage):
         )
 
         return MainArticle(
-            article=FranceTvInfoFeaturedArticle(
+            article=FranceTvInfoFeaturedArticle.create(
                 title=title.text.strip(),
                 url=main.find("a")["href"],
             )

+ 2 - 2
src/de_quoi_parle_le_monde/le_monde.py

@@ -20,7 +20,7 @@ class LeMondeMainPage(MainPage):
         all_articles = soup.find_all("div", class_="top-article")
         return [
             TopArticle(
-                article=LeMondeFeaturedArticle(
+                article=LeMondeFeaturedArticle.create(
                     title=a.text.strip(), url=a.find("a")["href"]
                 ),
                 rank=idx + 1,
@@ -32,7 +32,7 @@ class LeMondeMainPage(MainPage):
     def get_main_article(soup):
         main = soup.find("div", class_="article--main")
         return MainArticle(
-            article=LeMondeFeaturedArticle(
+            article=LeMondeFeaturedArticle.create(
                 title=main.find("p", class_="article__title-label").text.strip(),
                 url=main.find("a")["href"],
             )