jherve пре 1 година
родитељ
комит
4150de15fb

+ 16 - 0
src/de_quoi_parle_le_monde/http_client.py

@@ -0,0 +1,16 @@
+import requests_cache
+from aiohttp_client_cache import CachedSession, SQLiteBackend
+
+
class HttpClient:
    """Thin HTTP wrapper providing cached synchronous and asynchronous GETs.

    Synchronous requests go through a requests-cache session; asynchronous
    requests use aiohttp-client-cache backed by SQLite, so repeated fetches
    of the same URL do not hit the network again.
    """

    def __init__(self):
        # Persistent on-disk caches ("ia.sqlite" / "http.sqlite" in cwd).
        self.http_session = requests_cache.CachedSession("ia", backend="sqlite")
        self.cache = SQLiteBackend("http")

    def get(self, url, params=None):
        """Synchronous cached GET; returns the requests Response object."""
        # BUG FIX: requests' Session.get accepts only `url` positionally
        # (signature is get(url, **kwargs)); passing `params` positionally
        # raised TypeError.  It must be a keyword argument.
        return self.http_session.get(url, params=params)

    async def aget(self, url, params=None):
        """Asynchronous cached GET; returns the response body as text."""
        # A fresh CachedSession per call keeps the coroutine self-contained.
        # NOTE(review): it builds a new SQLiteBackend("http") instead of
        # reusing self.cache — same database file, but worth unifying.
        async with CachedSession(cache=SQLiteBackend("http")) as session:
            async with session.get(url, allow_redirects=True, params=params) as resp:
                return await resp.text()

+ 90 - 0
src/de_quoi_parle_le_monde/internet_archive.py

@@ -0,0 +1,90 @@
+from attrs import frozen
+from typing import Optional, ClassVar
+from datetime import date, datetime
+import cattrs
+from bs4 import BeautifulSoup
+
+from http_client import HttpClient
+
+
@frozen
class CdxRecord:
    """One result line returned by the Wayback Machine CDX API.

    Fields mirror the API's default space-separated output order.
    """

    urlkey: str
    timestamp: int
    original: str
    mimetype: str
    statuscode: int
    digest: str
    length: int

    @staticmethod
    def parse_line(line: str):
        """Parse a single space-separated CDX output line into a CdxRecord."""
        fields = line.split(" ")
        # cattrs coerces each positional string into the annotated field type.
        return cattrs.structure_attrs_fromtuple(fields, CdxRecord)
+
+
@frozen
class CdxRequest:
    """Query parameters for a Wayback Machine CDX API search.

    Attribute names that collide with Python keywords carry a trailing
    underscore (`from_`, `to_`) and are renamed to the wire names by
    `into_params`.
    """

    url: str
    filter: Optional[str] = None
    from_: Optional[date | datetime] = None
    to_: Optional[date | datetime] = None
    limit: Optional[int] = None

    # Maps keyword-safe attribute names to the parameter names the API expects.
    translation_dict: ClassVar[dict] = dict(from_="from", to_="to")
    date_format: ClassVar[str] = "%Y%m%d"
    datetime_format: ClassVar[str] = "%Y%m%d%H%M%S"

    def into_params(self) -> dict[str, str]:
        """Serialize this request into CDX query parameters.

        BUG FIX: unset (None) fields are now omitted; previously
        cattrs.unstructure kept them and they were sent to the server
        as the literal string "None" (e.g. ``filter=None``).
        """
        return {
            self._translate_key(k): self._stringify_value(v)
            for k, v in cattrs.unstructure(self).items()
            if v is not None
        }

    @classmethod
    def _translate_key(cls, key: str) -> str:
        """Rename keyword-safe attribute names to their API equivalents."""
        return cls.translation_dict.get(key, key)

    @classmethod
    def _stringify_value(cls, v) -> str:
        """Render date/datetime values in CDX timestamp format; str() otherwise."""
        # BUG FIX: datetime must be tested BEFORE date — every datetime is
        # also a date, so the old order formatted datetimes with the
        # day-resolution format and silently dropped the time component.
        if isinstance(v, datetime):
            return v.strftime(cls.datetime_format)
        elif isinstance(v, date):
            return v.strftime(cls.date_format)
        else:
            return str(v)
+
+
@frozen
class InternetArchiveSnapshot:
    """A single archived capture, identified by timestamp plus original URL."""

    timestamp: str
    original: str

    @property
    def url(self):
        """Wayback Machine URL that serves this capture."""
        return f"http://web.archive.org/web/{self.timestamp}/{self.original}"

    @staticmethod
    def from_record(rec: CdxRecord):
        """Build a snapshot from a parsed CDX record.

        NOTE(review): CdxRecord.timestamp is annotated int while this field
        is annotated str; attrs does not validate, so the value is stored
        as-is — confirm whether a str() conversion is intended.
        """
        return InternetArchiveSnapshot(rec.timestamp, rec.original)
+
+
class InternetArchiveClient:
    """Client for the Wayback Machine CDX search API.

    Reference: https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
    """

    def __init__(self):
        self.client = HttpClient()

    async def search_snapshots(self, req: CdxRequest):
        """Run a CDX search and return the matching snapshots."""
        body = await self.client.aget(
            "http://web.archive.org/cdx/search/cdx?", req.into_params()
        )
        # One snapshot per non-empty response line.
        return [
            InternetArchiveSnapshot.from_record(CdxRecord.parse_line(line))
            for line in body.splitlines()
        ]

    async def fetch_and_parse_snapshot(self, snap: InternetArchiveSnapshot):
        """Download an archived page and parse it into a BeautifulSoup tree."""
        html = await self.client.aget(snap.url)
        return BeautifulSoup(html, "html.parser")

+ 45 - 0
src/de_quoi_parle_le_monde/le_monde.py

@@ -0,0 +1,45 @@
+from attrs import frozen
+import cattrs
+from bs4 import BeautifulSoup
+
+from internet_archive import InternetArchiveSnapshot
+
+
@frozen
class LeMondeTopArticle:
    """A 'top article' teaser scraped from the Le Monde front page."""

    title: str
    url: str

    @staticmethod
    def from_soup(soup: BeautifulSoup):
        """Extract title and link from one top-article element."""
        fields = dict(title=soup.text.strip(), url=soup.find("a")["href"])
        return cattrs.structure(fields, LeMondeTopArticle)
+
+
@frozen
class LeMondeMainArticle:
    """The main headline article on the Le Monde front page."""

    title: str
    url: str

    @staticmethod
    def from_soup(soup: BeautifulSoup):
        """Extract the <h1> headline text and the first link from the element."""
        return cattrs.structure(
            dict(title=soup.find("h1").text.strip(), url=soup.find("a")["href"]),
            LeMondeMainArticle,
        )
+
+
@frozen
class LeMondeMainPage:
    """A parsed Le Monde front-page capture: the snapshot plus its HTML tree."""

    snapshot: InternetArchiveSnapshot
    soup: BeautifulSoup

    def get_top_articles(self):
        """All 'top article' teasers found on the page."""
        divs = self.soup.find_all("div", class_="top-article")
        return [LeMondeTopArticle.from_soup(d) for d in divs]

    def main_article(self):
        """The page's main headline article."""
        main_div = self.soup.find("div", class_="article--main")
        return LeMondeMainArticle.from_soup(main_div)

+ 3 - 136
src/de_quoi_parle_le_monde/main.py

@@ -1,141 +1,8 @@
-import requests_cache
-from attrs import frozen
-from typing import Optional, ClassVar
-from datetime import date, datetime, timedelta
-import cattrs
-from bs4 import BeautifulSoup
-from aiohttp_client_cache import CachedSession, SQLiteBackend
+from datetime import date, timedelta
 import asyncio
 
-
-@frozen
-class CdxRecord:
-    urlkey: str
-    timestamp: int
-    original: str
-    mimetype: str
-    statuscode: int
-    digest: str
-    length: int
-
-    @staticmethod
-    def parse_line(line: str):
-        return cattrs.structure_attrs_fromtuple(line.split(" "), CdxRecord)
-
-
-@frozen
-class CdxRequest:
-    url: str
-    filter: Optional[str] = None
-    from_: Optional[date | datetime] = None
-    to_: Optional[date | datetime] = None
-    limit: Optional[int] = None
-
-    translation_dict: ClassVar[dict] = dict(from_="from", to_="to")
-    date_format: ClassVar[str] = "%Y%m%d"
-    datetime_format: ClassVar[str] = "%Y%m%d%H%M%S"
-
-    def into_params(self) -> dict[str, str]:
-        return {
-            self._translate_key(k): self._stringify_value(v)
-            for k, v in cattrs.unstructure(self).items()
-        }
-
-    @classmethod
-    def _translate_key(cls, key: str) -> str:
-        return cls.translation_dict.get(key, key)
-
-    @classmethod
-    def _stringify_value(cls, v) -> str:
-        if isinstance(v, date):
-            return v.strftime(cls.date_format)
-        elif isinstance(v, datetime):
-            return v.strftime(cls.datetime_format)
-        else:
-            return str(v)
-
-
-class HttpClient:
-    def __init__(self):
-        self.http_session = requests_cache.CachedSession("ia", backend="sqlite")
-        self.cache = SQLiteBackend("http")
-
-    def get(self, url, params=None):
-        return self.http_session.get(url, params)
-
-    async def aget(self, url, params=None):
-        async with CachedSession(cache=SQLiteBackend("http")) as session:
-            async with session.get(url, allow_redirects=True, params=params) as resp:
-                return await resp.text()
-
-
-@frozen
-class InternetArchiveSnapshot:
-    timestamp: str
-    original: str
-
-    @property
-    def url(self):
-        return f"http://web.archive.org/web/{self.timestamp}/{self.original}"
-
-    @staticmethod
-    def from_record(rec: CdxRecord):
-        return InternetArchiveSnapshot(timestamp=rec.timestamp, original=rec.original)
-
-
-@frozen
-class LeMondeTopArticle:
-    title: str
-    url: str
-
-    @staticmethod
-    def from_soup(soup: BeautifulSoup):
-        return cattrs.structure(dict(title=soup.text.strip(), url=soup.find("a")["href"]), LeMondeTopArticle)
-
-
-@frozen
-class LeMondeMainArticle:
-    title: str
-    url: str
-
-    @staticmethod
-    def from_soup(soup: BeautifulSoup):
-        attrs = dict(title=soup.find("h1").text.strip(), url=soup.find("a")["href"])
-        return cattrs.structure(attrs, LeMondeMainArticle)
-
-
-@frozen
-class LeMondeMainPage:
-    snapshot: InternetArchiveSnapshot
-    soup: BeautifulSoup
-
-    def get_top_articles(self):
-        return [LeMondeTopArticle.from_soup(s) for s in self.soup.find_all("div", class_="top-article")]
-
-    def main_article(self):
-        return LeMondeMainArticle.from_soup(self.soup.find("div", class_="article--main"))
-
-
-class InternetArchiveClient:
-    # https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
-
-    def __init__(self):
-        self.client = HttpClient()
-
-    async def search_snapshots(self, req: CdxRequest):
-        def to_snapshot(line):
-            record = CdxRecord.parse_line(line)
-            return InternetArchiveSnapshot.from_record(record)
-
-        resp = await self.client.aget(
-            "http://web.archive.org/cdx/search/cdx?", req.into_params()
-        )
-
-        return [to_snapshot(line) for line in resp.splitlines()]
-
-    async def fetch_and_parse_snapshot(self, snap: InternetArchiveSnapshot):
-        resp = await self.client.aget(snap.url)
-        return BeautifulSoup(resp, "html.parser")
+from internet_archive import InternetArchiveClient, CdxRequest
+from le_monde import LeMondeMainPage
 
 
 async def get_latest_snaps():