
Add a snapshot class

jherve committed 1 year ago
commit c87c78f698
1 changed file with 38 additions and 9 deletions

src/de_quoi_parle_le_monde/main.py (+38 -9)

@@ -69,6 +69,29 @@ class HttpClient:
                 return await resp.text()


+@frozen
+class InternetArchiveSnapshot:
+    timestamp: str
+    original: str
+
+    @property
+    def url(self):
+        return f"http://web.archive.org/web/{self.timestamp}/{self.original}"
+
+    @staticmethod
+    def from_record(rec: CdxRecord):
+        return InternetArchiveSnapshot(timestamp=rec.timestamp, original=rec.original)
+
+
+@frozen
+class LeMondeMainPage:
+    snapshot: InternetArchiveSnapshot
+    soup: BeautifulSoup
+
+    def get_top_articles_titles(self):
+        return [s.text.strip() for s in self.soup.find_all("div", class_="top-article")]
+
+
 class InternetArchiveClient:
     # https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server

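As a reading aid, a minimal sketch of the new value class in isolation, assuming @frozen is the attrs decorator (the import lies outside these hunks) and using a made-up timestamp:

from attrs import frozen

# Same shape as the class added above; shown standalone so it runs as-is.
@frozen
class InternetArchiveSnapshot:
    timestamp: str
    original: str

    @property
    def url(self):
        # Wayback Machine replay URL: /web/<timestamp>/<original URL>
        return f"http://web.archive.org/web/{self.timestamp}/{self.original}"

snap = InternetArchiveSnapshot(timestamp="20240101000000", original="https://www.lemonde.fr/")
print(snap.url)  # http://web.archive.org/web/20240101000000/https://www.lemonde.fr/
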
@@ -76,13 +99,19 @@ class InternetArchiveClient:
         self.client = HttpClient()

     async def search_snapshots(self, req: CdxRequest):
+        def to_snapshot(line):
+            record = CdxRecord.parse_line(line)
+            return InternetArchiveSnapshot.from_record(record)
+
         resp = await self.client.aget(
             "http://web.archive.org/cdx/search/cdx?", req.into_params()
         )
-        return [CdxRecord.parse_line(line) for line in resp.splitlines()]

-    async def get_snapshot(self, url, snap_date):
-        return await self.client.aget(f"http://web.archive.org/web/{snap_date}/{url}")
+        return [to_snapshot(line) for line in resp.splitlines()]
+
+    async def fetch_and_parse_snapshot(self, snap: InternetArchiveSnapshot):
+        resp = await self.client.aget(snap.url)
+        return BeautifulSoup(resp, "html.parser")


 class WebPage:
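Taken together, search_snapshots now yields domain objects and fetch_and_parse_snapshot turns one into parsed HTML. A hedged sketch of a caller (this helper is hypothetical; only the methods it calls appear in the diff):

# Hypothetical caller of the reworked client; CdxRequest construction is
# elided because its fields are not shown in this diff.
async def top_titles(ia: InternetArchiveClient, req: CdxRequest) -> list[str]:
    snaps = await ia.search_snapshots(req)            # list[InternetArchiveSnapshot]
    latest = snaps[0]                                 # assume at least one capture
    soup = await ia.fetch_and_parse_snapshot(latest)  # BeautifulSoup of the archived page
    return LeMondeMainPage(latest, soup).get_top_articles_titles()
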
@@ -103,15 +132,15 @@ async def get_latest_snaps():
         )
         return ia.search_snapshots(req)

-    async def get_snap(res):
-        snap = await ia.get_snapshot(res[-1].original, res[-1].timestamp)
-        page = WebPage(snap)
-        return page.get_top_articles_titles()
+    async def parse_snap(snap):
+        soup = await ia.fetch_and_parse_snapshot(snap)
+        return LeMondeMainPage(snap, soup).get_top_articles_titles()

-    results = await asyncio.gather(*[build_request(d) for d in dates])
-    top = await asyncio.gather(*[get_snap(r) for r in results])
+    snaps = await asyncio.gather(*[build_request(d) for d in dates])
+    top = await asyncio.gather(*[parse_snap(s[0]) for s in snaps])
     for t in top:
         print(t[0], t[-1])
+    print([InternetArchiveSnapshot.from_record(r[0]) for r in snaps])


 asyncio.run(get_latest_snaps())
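
For context on the input search_snapshots parses: the CDX server linked in the comment above returns one space-separated line per capture, with the default field order urlkey, timestamp, original, mimetype, statuscode, digest, length. The project's real CdxRecord is not part of this diff; a stand-in illustrates how such a line becomes a snapshot:

from attrs import frozen

# Stand-in for the project's CdxRecord (an assumption: only parse_line,
# .timestamp and .original are visible in the diff). Field order follows
# the wayback-cdx-server default output.
@frozen
class CdxRecord:
    urlkey: str
    timestamp: str
    original: str
    mimetype: str
    statuscode: str
    digest: str
    length: str

    @staticmethod
    def parse_line(line: str) -> "CdxRecord":
        return CdxRecord(*line.split())

line = "fr,lemonde)/ 20240101003014 https://www.lemonde.fr/ text/html 200 ABC123 54321"
rec = CdxRecord.parse_line(line)
print(rec.timestamp, rec.original)
# InternetArchiveSnapshot.from_record(rec).url would then be:
# http://web.archive.org/web/20240101003014/https://www.lemonde.fr/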