Browse Source

Add InternetArchiveSnapshot class

jherve 1 year ago
parent
commit
a9bb24bd89

+ 9 - 3
src/de_quoi_parle_le_monde/internet_archive.py

@@ -83,6 +83,12 @@ class InternetArchiveSnapshotId:
         return InternetArchiveSnapshotId(timestamp=rec.timestamp, original=rec.original)
 
 
+@frozen
+class InternetArchiveSnapshot:
+    id: InternetArchiveSnapshotId  # which capture this is (timestamp + original URL)
+    text: str  # body fetched for that capture
+
+
 @frozen
 class InternetArchiveClient:
     # https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
@@ -100,9 +106,9 @@ class InternetArchiveClient:
 
         return [to_snapshot_id(line) for line in resp.splitlines()]
 
-    async def fetch(self, snap: InternetArchiveSnapshotId) -> str:
-        resp = await self.session.get(snap.url)
-        return resp
+    async def fetch(self, id_: InternetArchiveSnapshotId) -> InternetArchiveSnapshot:
+        resp = await self.session.get(id_.url)  # response body for the capture URL
+        return InternetArchiveSnapshot(id_, resp)  # keep the body paired with its id
 
     async def get_snapshot_id_closest_to(self, url, dt):
         req = CdxRequest(

+ 5 - 7
src/de_quoi_parle_le_monde/le_monde.py

@@ -4,7 +4,7 @@ import cattrs
 import asyncio
 from bs4 import BeautifulSoup
 
-from de_quoi_parle_le_monde.internet_archive import InternetArchiveSnapshotId
+from de_quoi_parle_le_monde.internet_archive import InternetArchiveSnapshot
 
 
 @frozen
@@ -35,7 +35,7 @@ class LeMondeMainArticle:
 
 @frozen
 class LeMondeMainPage:
-    snapshot_id: InternetArchiveSnapshotId
+    snapshot: InternetArchiveSnapshot
     soup: BeautifulSoup
 
     def get_top_articles(self):
@@ -50,12 +50,10 @@ class LeMondeMainPage:
         )
 
     @staticmethod
-    async def from_content(
-        snapshot_id: InternetArchiveSnapshotId, text: str
-    ) -> "LeMondeMainPage":
+    async def from_snapshot(snapshot: InternetArchiveSnapshot) -> "LeMondeMainPage":
         loop = asyncio.get_event_loop()
-        soup = await loop.run_in_executor(None, BeautifulSoup, text, "lxml")
-        return LeMondeMainPage(snapshot_id, soup)
+        soup = await loop.run_in_executor(None, BeautifulSoup, snapshot.text, "lxml")
+        return LeMondeMainPage(snapshot, soup)
 
 
 @frozen

+ 4 - 4
src/de_quoi_parle_le_monde/main.py

@@ -24,15 +24,15 @@ class ArchiveDownloader:
 
             async def handle_snap(dt):
                 id_closest = await ia.get_snapshot_id_closest_to(LeMondeArchive.url, dt)
-                closest_body = await ia.fetch(id_closest)
-                return await LeMondeMainPage.from_content(id_closest, closest_body)
+                closest = await ia.fetch(id_closest)
+                return await LeMondeMainPage.from_snapshot(closest)
 
             return await asyncio.gather(*[handle_snap(d) for d in dts])
 
 
 http_client = HttpClient()
 dler = ArchiveDownloader(http_client)
-snaps = asyncio.run(dler.get_latest_snaps(ArchiveDownloader.last_n_days(5)))
+snaps = asyncio.run(dler.get_latest_snaps(ArchiveDownloader.last_n_days(1)))
 
 for s in snaps:
-    print(s.snapshot.timestamp, s.get_top_articles()[0], s.main_article())
+    print(s.snapshot.id.timestamp, s.get_top_articles()[0], s.main_article())