jherve пре 1 година
родитељ
комит
4150de15fb

+ 16 - 0
src/de_quoi_parle_le_monde/http_client.py

@@ -0,0 +1,16 @@
+import requests_cache
+from aiohttp_client_cache import CachedSession, SQLiteBackend
+
+
class HttpClient:
    """Thin HTTP wrapper providing cached synchronous and asynchronous GETs.

    Synchronous requests go through a requests-cache session; asynchronous
    requests use aiohttp-client-cache backed by SQLite, so repeated fetches
    of the same URL do not hit the network again.
    """

    def __init__(self):
        # Persistent on-disk caches ("ia.sqlite" / "http.sqlite" in cwd).
        self.http_session = requests_cache.CachedSession("ia", backend="sqlite")
        self.cache = SQLiteBackend("http")

    def get(self, url, params=None):
        """Synchronous cached GET; returns the requests Response object."""
        # BUG FIX: requests' Session.get accepts only `url` positionally
        # (signature is get(url, **kwargs)); passing `params` positionally
        # raised TypeError.  It must be a keyword argument.
        return self.http_session.get(url, params=params)

    async def aget(self, url, params=None):
        """Asynchronous cached GET; returns the response body as text."""
        # A fresh CachedSession per call keeps the coroutine self-contained.
        # NOTE(review): it builds a new SQLiteBackend("http") instead of
        # reusing self.cache — same database file, but worth unifying.
        async with CachedSession(cache=SQLiteBackend("http")) as session:
            async with session.get(url, allow_redirects=True, params=params) as resp:
                return await resp.text()

+ 90 - 0
src/de_quoi_parle_le_monde/internet_archive.py

@@ -0,0 +1,90 @@
+from attrs import frozen
+from typing import Optional, ClassVar
+from datetime import date, datetime
+import cattrs
+from bs4 import BeautifulSoup
+
+from http_client import HttpClient
+
+
@frozen
class CdxRecord:
    """One result line returned by the Wayback Machine CDX API.

    Fields mirror the API's default space-separated output order.
    """

    urlkey: str
    timestamp: int
    original: str
    mimetype: str
    statuscode: int
    digest: str
    length: int

    @staticmethod
    def parse_line(line: str):
        """Parse a single space-separated CDX output line into a CdxRecord."""
        fields = line.split(" ")
        # cattrs coerces each positional string into the annotated field type.
        return cattrs.structure_attrs_fromtuple(fields, CdxRecord)
+
+
@frozen
class CdxRequest:
    """Query parameters for a Wayback Machine CDX API search.

    Attribute names that collide with Python keywords carry a trailing
    underscore (`from_`, `to_`) and are renamed to the wire names by
    `into_params`.
    """

    url: str
    filter: Optional[str] = None
    from_: Optional[date | datetime] = None
    to_: Optional[date | datetime] = None
    limit: Optional[int] = None

    # Maps keyword-safe attribute names to the parameter names the API expects.
    translation_dict: ClassVar[dict] = dict(from_="from", to_="to")
    date_format: ClassVar[str] = "%Y%m%d"
    datetime_format: ClassVar[str] = "%Y%m%d%H%M%S"

    def into_params(self) -> dict[str, str]:
        """Serialize this request into CDX query parameters.

        BUG FIX: unset (None) fields are now omitted; previously
        cattrs.unstructure kept them and they were sent to the server
        as the literal string "None" (e.g. ``filter=None``).
        """
        return {
            self._translate_key(k): self._stringify_value(v)
            for k, v in cattrs.unstructure(self).items()
            if v is not None
        }

    @classmethod
    def _translate_key(cls, key: str) -> str:
        """Rename keyword-safe attribute names to their API equivalents."""
        return cls.translation_dict.get(key, key)

    @classmethod
    def _stringify_value(cls, v) -> str:
        """Render date/datetime values in CDX timestamp format; str() otherwise."""
        # BUG FIX: datetime must be tested BEFORE date — every datetime is
        # also a date, so the old order formatted datetimes with the
        # day-resolution format and silently dropped the time component.
        if isinstance(v, datetime):
            return v.strftime(cls.datetime_format)
        elif isinstance(v, date):
            return v.strftime(cls.date_format)
        else:
            return str(v)
+
+
@frozen
class InternetArchiveSnapshot:
    """A single archived capture, identified by timestamp plus original URL."""

    timestamp: str
    original: str

    @property
    def url(self):
        """Wayback Machine URL that serves this capture."""
        return f"http://web.archive.org/web/{self.timestamp}/{self.original}"

    @staticmethod
    def from_record(rec: CdxRecord):
        """Build a snapshot from a parsed CDX record.

        NOTE(review): CdxRecord.timestamp is annotated int while this field
        is annotated str; attrs does not validate, so the value is stored
        as-is — confirm whether a str() conversion is intended.
        """
        return InternetArchiveSnapshot(rec.timestamp, rec.original)
+
+
class InternetArchiveClient:
    """Client for the Wayback Machine CDX search API.

    Reference: https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
    """

    def __init__(self):
        self.client = HttpClient()

    async def search_snapshots(self, req: CdxRequest):
        """Run a CDX search and return the matching snapshots."""
        body = await self.client.aget(
            "http://web.archive.org/cdx/search/cdx?", req.into_params()
        )
        # One snapshot per non-empty response line.
        return [
            InternetArchiveSnapshot.from_record(CdxRecord.parse_line(line))
            for line in body.splitlines()
        ]

    async def fetch_and_parse_snapshot(self, snap: InternetArchiveSnapshot):
        """Download an archived page and parse it into a BeautifulSoup tree."""
        html = await self.client.aget(snap.url)
        return BeautifulSoup(html, "html.parser")

+ 45 - 0
src/de_quoi_parle_le_monde/le_monde.py

@@ -0,0 +1,45 @@
+from attrs import frozen
+import cattrs
+from bs4 import BeautifulSoup
+
+from internet_archive import InternetArchiveSnapshot
+
+
@frozen
class LeMondeTopArticle:
    """A 'top article' teaser scraped from the Le Monde front page."""

    title: str
    url: str

    @staticmethod
    def from_soup(soup: BeautifulSoup):
        """Extract title and link from one top-article element."""
        fields = dict(title=soup.text.strip(), url=soup.find("a")["href"])
        return cattrs.structure(fields, LeMondeTopArticle)
+
+
@frozen
class LeMondeMainArticle:
    """The main headline article on the Le Monde front page."""

    title: str
    url: str

    @staticmethod
    def from_soup(soup: BeautifulSoup):
        """Extract the <h1> headline text and the first link from the element."""
        return cattrs.structure(
            dict(title=soup.find("h1").text.strip(), url=soup.find("a")["href"]),
            LeMondeMainArticle,
        )
+
+
@frozen
class LeMondeMainPage:
    """A parsed Le Monde front-page capture: the snapshot plus its HTML tree."""

    snapshot: InternetArchiveSnapshot
    soup: BeautifulSoup

    def get_top_articles(self):
        """All 'top article' teasers found on the page."""
        divs = self.soup.find_all("div", class_="top-article")
        return [LeMondeTopArticle.from_soup(d) for d in divs]

    def main_article(self):
        """The page's main headline article."""
        main_div = self.soup.find("div", class_="article--main")
        return LeMondeMainArticle.from_soup(main_div)

+ 3 - 136
src/de_quoi_parle_le_monde/main.py

@@ -1,141 +1,8 @@
-import requests_cache
-from attrs import frozen
-from typing import Optional, ClassVar
-from datetime import date, datetime, timedelta
-import cattrs
-from bs4 import BeautifulSoup
-from aiohttp_client_cache import CachedSession, SQLiteBackend
+from datetime import date, timedelta
 import asyncio
 
-
-@frozen
-class CdxRecord:
-    urlkey: str
-    timestamp: int
-    original: str
-    mimetype: str
-    statuscode: int
-    digest: str
-    length: int
-
-    @staticmethod
-    def parse_line(line: str):
-        return cattrs.structure_attrs_fromtuple(line.split(" "), CdxRecord)
-
-
-@frozen
-class CdxRequest:
-    url: str
-    filter: Optional[str] = None
-    from_: Optional[date | datetime] = None
-    to_: Optional[date | datetime] = None
-    limit: Optional[int] = None
-
-    translation_dict: ClassVar[dict] = dict(from_="from", to_="to")
-    date_format: ClassVar[str] = "%Y%m%d"
-    datetime_format: ClassVar[str] = "%Y%m%d%H%M%S"
-
-    def into_params(self) -> dict[str, str]:
-        return {
-            self._translate_key(k): self._stringify_value(v)
-            for k, v in cattrs.unstructure(self).items()
-        }
-
-    @classmethod
-    def _translate_key(cls, key: str) -> str:
-        return cls.translation_dict.get(key, key)
-
-    @classmethod
-    def _stringify_value(cls, v) -> str:
-        if isinstance(v, date):
-            return v.strftime(cls.date_format)
-        elif isinstance(v, datetime):
-            return v.strftime(cls.datetime_format)
-        else:
-            return str(v)
-
-
-class HttpClient:
-    def __init__(self):
-        self.http_session = requests_cache.CachedSession("ia", backend="sqlite")
-        self.cache = SQLiteBackend("http")
-
-    def get(self, url, params=None):
-        return self.http_session.get(url, params)
-
-    async def aget(self, url, params=None):
-        async with CachedSession(cache=SQLiteBackend("http")) as session:
-            async with session.get(url, allow_redirects=True, params=params) as resp:
-                return await resp.text()
-
-
-@frozen
-class InternetArchiveSnapshot:
-    timestamp: str
-    original: str
-
-    @property
-    def url(self):
-        return f"http://web.archive.org/web/{self.timestamp}/{self.original}"
-
-    @staticmethod
-    def from_record(rec: CdxRecord):
-        return InternetArchiveSnapshot(timestamp=rec.timestamp, original=rec.original)
-
-
-@frozen
-class LeMondeTopArticle:
-    title: str
-    url: str
-
-    @staticmethod
-    def from_soup(soup: BeautifulSoup):
-        return cattrs.structure(dict(title=soup.text.strip(), url=soup.find("a")["href"]), LeMondeTopArticle)
-
-
-@frozen
-class LeMondeMainArticle:
-    title: str
-    url: str
-
-    @staticmethod
-    def from_soup(soup: BeautifulSoup):
-        attrs = dict(title=soup.find("h1").text.strip(), url=soup.find("a")["href"])
-        return cattrs.structure(attrs, LeMondeMainArticle)
-
-
-@frozen
-class LeMondeMainPage:
-    snapshot: InternetArchiveSnapshot
-    soup: BeautifulSoup
-
-    def get_top_articles(self):
-        return [LeMondeTopArticle.from_soup(s) for s in self.soup.find_all("div", class_="top-article")]
-
-    def main_article(self):
-        return LeMondeMainArticle.from_soup(self.soup.find("div", class_="article--main"))
-
-
-class InternetArchiveClient:
-    # https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
-
-    def __init__(self):
-        self.client = HttpClient()
-
-    async def search_snapshots(self, req: CdxRequest):
-        def to_snapshot(line):
-            record = CdxRecord.parse_line(line)
-            return InternetArchiveSnapshot.from_record(record)
-
-        resp = await self.client.aget(
-            "http://web.archive.org/cdx/search/cdx?", req.into_params()
-        )
-
-        return [to_snapshot(line) for line in resp.splitlines()]
-
-    async def fetch_and_parse_snapshot(self, snap: InternetArchiveSnapshot):
-        resp = await self.client.aget(snap.url)
-        return BeautifulSoup(resp, "html.parser")
+from internet_archive import InternetArchiveClient, CdxRequest
+from le_monde import LeMondeMainPage
 
 
 async def get_latest_snaps():