Sfoglia il codice sorgente

Add a top article class

jherve 1 anno fa
parent
commit
e4df3419ea
1 file modificato con 14 aggiunte e 13 eliminazioni
  1. +14 -13
      src/de_quoi_parle_le_monde/main.py

+ 14 - 13
src/de_quoi_parle_le_monde/main.py

@@ -83,13 +83,23 @@ class InternetArchiveSnapshot:
         return InternetArchiveSnapshot(timestamp=rec.timestamp, original=rec.original)
 
 
+@frozen
+class LeMondeTopArticle:
+    title: str
+    url: str
+
+    @staticmethod
+    def from_soup(soup: BeautifulSoup):
+        return cattrs.structure(dict(title=soup.text.strip(), url=soup.find("a")["href"]), LeMondeTopArticle)
+
+
 @frozen
 class LeMondeMainPage:
     snapshot: InternetArchiveSnapshot
     soup: BeautifulSoup
 
-    def get_top_articles_titles(self):
-        return [s.text.strip() for s in self.soup.find_all("div", class_="top-article")]
+    def get_top_articles(self):
+        return [LeMondeTopArticle.from_soup(s) for s in self.soup.find_all("div", class_="top-article")]
 
 
 class InternetArchiveClient:
@@ -114,14 +124,6 @@ class InternetArchiveClient:
         return BeautifulSoup(resp, "html.parser")
 
 
-class WebPage:
-    def __init__(self, doc):
-        self.soup = BeautifulSoup(doc, "html.parser")
-
-    def get_top_articles_titles(self):
-        return [s.text.strip() for s in self.soup.find_all("div", class_="top-article")]
-
-
 async def get_latest_snaps():
     dates = [date.today() - timedelta(days=n) for n in range(0, 10)]
     ia = InternetArchiveClient()
@@ -134,13 +136,12 @@ async def get_latest_snaps():
 
     async def parse_snap(snap):
         soup = await ia.fetch_and_parse_snapshot(snap)
-        return LeMondeMainPage(snap, soup).get_top_articles_titles()
+        return LeMondeMainPage(snap, soup)
 
     snaps = await asyncio.gather(*[build_request(d) for d in dates])
     top = await asyncio.gather(*[parse_snap(s[0]) for s in snaps])
     for t in top:
-        print(t[0], t[-1])
-    print([InternetArchiveSnapshot.from_record(r[0]) for r in snaps])
+        print(t.get_top_articles()[0], t.get_top_articles()[-1])
 
 
 asyncio.run(get_latest_snaps())