Browse Source

Define some data structures with attrs/cattrs

jherve cách đây 1 năm
mục cha
commit
b38e4450e8
3 tập tin đã thay đổi với 48 bổ sung và 23 xóa
  1. 1 1
      pdm.lock
  2. 2 0
      pyproject.toml
  3. 45 22
      src/de_quoi_parle_le_monde/main.py

+ 1 - 1
pdm.lock

@@ -5,7 +5,7 @@
 groups = ["default"]
 strategy = ["cross_platform", "inherit_metadata"]
 lock_version = "4.4.1"
-content_hash = "sha256:d55e0b94e88214e9010d405930d5992c0b9788c107bd9c8557f34b737a612b3a"
+content_hash = "sha256:eb95be79209782a40de47afccdae15ae1b9e1612e725344cd3617f58960106cd"
 
 [[package]]
 name = "attrs"

+ 2 - 0
pyproject.toml

@@ -9,6 +9,8 @@ dependencies = [
     "requests>=2.31.0",
     "requests-cache>=1.2.0",
     "beautifulsoup4>=4.12.3",
+    "attrs>=23.2.0",
+    "cattrs>=23.2.3",
 ]
 requires-python = "==3.11.*"
 readme = "README.md"

+ 45 - 22
src/de_quoi_parle_le_monde/main.py

@@ -1,5 +1,8 @@
 import requests
 import requests_cache
+from attrs import frozen
+from typing import Optional, ClassVar
+import cattrs
 from requests_cache.models.response import CachedResponse
 from requests_cache.backends.sqlite import SQLiteCache
 from bs4 import BeautifulSoup
@@ -7,30 +10,49 @@ from bs4 import BeautifulSoup
 http_session = requests_cache.CachedSession("ia",backend="sqlite")
 
 
-class InternetArchive:
-    # https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
@frozen
class CdxRecord:
    """One result record from the Wayback Machine CDX API.

    Each API response line is space-separated and maps positionally onto
    these fields; cattrs coerces every value to its annotated type
    (e.g. ``timestamp`` and ``statuscode`` become ``int``).
    """

    urlkey: str
    timestamp: int
    original: str
    mimetype: str
    statuscode: int
    digest: str
    length: int  # was "length:int" — PEP 8 requires a space after the colon

    @staticmethod
    def parse_line(line: str) -> "CdxRecord":
        """Parse one space-separated CDX response line into a CdxRecord."""
        return cattrs.structure_attrs_fromtuple(line.split(" "), CdxRecord)
+
+
@frozen
class CdxRequest:
    """Query parameters for a CDX search.

    ``from_`` and ``to_`` carry a trailing underscore because ``from`` is a
    Python keyword; :meth:`into_params` renames them back to the parameter
    names the API actually expects.
    """

    url: str
    filter: Optional[str] = None
    from_: Optional[str] = None
    to_: Optional[str] = None
    limit: Optional[int] = None

    # ClassVar: attrs/cattrs skip it when (un)structuring instances.
    translation_dict: ClassVar[dict] = dict(from_="from", to_="to")

    def into_params(self) -> dict:
        """Return the request as an HTTP query-parameter dict.

        Fields left at ``None`` are omitted so unset options are not sent
        to the API at all (previously e.g. ``filter=None`` leaked through).
        """
        return {
            self._translate_key(k): v
            for k, v in cattrs.unstructure(self).items()
            if v is not None
        }

    @classmethod
    def _translate_key(cls, key: str) -> str:
        """Map a field name to its API parameter name (identity by default)."""
        return cls.translation_dict.get(key, key)
+
+
class InternetArchive:
    """Thin client for the Internet Archive Wayback Machine.

    API reference:
    https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
    """

    # No trailing "?" — requests appends the query string itself.
    CDX_URL = "http://web.archive.org/cdx/search/cdx"

    @staticmethod
    def search_snapshots(req: CdxRequest) -> list:
        """Run a CDX search and parse each response line into a CdxRecord.

        Raises requests.HTTPError on a non-2xx answer instead of silently
        trying to parse an error page.
        """
        resp = http_session.get(InternetArchive.CDX_URL, req.into_params())
        resp.raise_for_status()
        return [CdxRecord.parse_line(line) for line in resp.text.splitlines()]

    @staticmethod
    def get_snapshot(url, snap_date):
        """Fetch the snapshot of *url* archived at *snap_date* (YYYYMMDDhhmmss)."""
        return http_session.get(f"http://web.archive.org/web/{snap_date}/{url}")
 
 
 class WebPage:
@@ -38,14 +60,15 @@ class WebPage:
         self.soup = BeautifulSoup(doc, 'html.parser')
 
     def get_top_articles_titles(self):
-        return [s.text.strip() for s in w.soup.find_all("div", class_="top-article")]
+        return [s.text.strip() for s in self.soup.find_all("div", class_="top-article")]
 
 def get_latest_snap():
-    r = InternetArchive.search_snapshots("lemonde.fr", {"from": "20240222", "to": "20240222", "limit": 10})
-    results = [InternetArchive.parse_results(r) for r in r.text.splitlines()]
+    req = CdxRequest(url="lemonde.fr", from_="20240222", to_="20240222", limit=10, filter="statuscode:200")
+    results = InternetArchive.search_snapshots(req)
 
-    return InternetArchive.get_snapshot(*results[-1])
+    latest = results[-1]
+    print(latest)
+    return InternetArchive.get_snapshot(latest.original, latest.timestamp)
 
 
 print(WebPage(get_latest_snap().text).get_top_articles_titles())
-