浏览代码

Add get_snapshot_closest_to method

jherve 1 年之前
父节点
当前提交
ef2331a152
共有 2 个文件被更改,包括 36 次插入和 19 次删除
  1. 13 1
      src/de_quoi_parle_le_monde/internet_archive.py
  2. 23 18
      src/de_quoi_parle_le_monde/main.py

+ 13 - 1
src/de_quoi_parle_le_monde/internet_archive.py

@@ -1,6 +1,6 @@
 from attrs import frozen
 from typing import Optional, ClassVar, NewType
-from datetime import date, datetime
+from datetime import date, datetime, timedelta
 import cattrs
 from bs4 import BeautifulSoup
 
@@ -104,3 +104,15 @@ class InternetArchiveClient:
     ) -> BeautifulSoup:
         resp = await self.client.aget(snap.url)
         return BeautifulSoup(resp, "lxml")
+
+    async def get_snapshot_closest_to(self, url, dt):
+        req = CdxRequest(
+            url=url,
+            from_=dt - timedelta(hours=6.0),
+            to_=dt + timedelta(hours=6.0),
+            filter="statuscode:200",
+        )
+
+        all_snaps = await self.search_snapshots(req)
+        closest = min(all_snaps, key=lambda s: abs(s.timestamp - dt))
+        return closest

+ 23 - 18
src/de_quoi_parle_le_monde/main.py

@@ -1,32 +1,37 @@
-from datetime import date, timedelta
+from datetime import date, datetime, time, timedelta
 import asyncio
+from attrs import frozen
 
 from de_quoi_parle_le_monde.http import HttpClient
-from de_quoi_parle_le_monde.internet_archive import InternetArchiveClient, CdxRequest
+from de_quoi_parle_le_monde.internet_archive import InternetArchiveClient
 from de_quoi_parle_le_monde.le_monde import LeMondeArchive, LeMondeMainPage
 
 
-async def get_latest_snaps(dates):
+async def get_latest_snaps(dts):
     http_client = HttpClient()
     ia = InternetArchiveClient(http_client)
 
-    async def req_and_parse_first_snap(date):
-        req = CdxRequest(
-            url=LeMondeArchive.url,
-            from_=date,
-            to_=date,
-            limit=10,
-            filter="statuscode:200",
-        )
-        snaps = await ia.search_snapshots(req)
-        snap = snaps[0]
-        soup = await ia.fetch_and_parse_snapshot(snap)
-        return LeMondeMainPage(snap, soup)
+    async def req_and_parse_first_snap(dt):
+        closest = await ia.get_snapshot_closest_to(LeMondeArchive.url, dt)
+        closest_content = await ia.fetch_and_parse_snapshot(closest)
+        return LeMondeMainPage(closest, closest_content)
 
-    return await asyncio.gather(*[req_and_parse_first_snap(d) for d in dates])
+    return await asyncio.gather(*[req_and_parse_first_snap(d) for d in dts])
 
 
-dates = [date.today() - timedelta(days=n) for n in range(0, 10)]
-snaps = asyncio.run(get_latest_snaps(dates))
+@frozen
+class ArchiveDownloader:
+    client: InternetArchiveClient
+
+    @staticmethod
+    def from_http_client(http_client):
+        return ArchiveDownloader(InternetArchiveClient(http_client))
+
+
+dts = [
+    datetime.combine(date.today() - timedelta(days=n), time(hour=18))
+    for n in range(0, 5)
+]
+snaps = asyncio.run(get_latest_snaps(dts))
 for s in snaps:
     print(s.snapshot.timestamp, s.get_top_articles()[0], s.main_article())