|
|
@@ -1,5 +1,8 @@
|
|
|
import requests
|
|
|
import requests_cache
|
|
|
+from attrs import frozen
|
|
|
+from typing import Optional, ClassVar
|
|
|
+import cattrs
|
|
|
from requests_cache.models.response import CachedResponse
|
|
|
from requests_cache.backends.sqlite import SQLiteCache
|
|
|
from bs4 import BeautifulSoup
|
|
|
@@ -7,30 +10,49 @@ from bs4 import BeautifulSoup
|
|
|
http_session = requests_cache.CachedSession("ia",backend="sqlite")
|
|
|
|
|
|
|
|
|
-class InternetArchive:
|
|
|
- # https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
|
|
|
+@frozen
|
|
|
+class CdxRecord:
|
|
|
+ urlkey: str
|
|
|
+ timestamp: int
|
|
|
+ original: str
|
|
|
+ mimetype: str
|
|
|
+ statuscode: int
|
|
|
+ digest: str
|
|
|
+    length: int
|
|
|
|
|
|
@staticmethod
|
|
|
- def search_snapshots(url: str, params: dict):
|
|
|
- return http_session.get("http://web.archive.org/cdx/search/cdx?", {"url": url} | {"filter": "statuscode:200"} | params)
|
|
|
+ def parse_line(line: str):
|
|
|
+ return cattrs.structure_attrs_fromtuple(line.split(" "), CdxRecord)
|
|
|
+
|
|
|
+
|
|
|
+@frozen
|
|
|
+class CdxRequest:
|
|
|
+ url: str
|
|
|
+ filter: Optional[str] = None
|
|
|
+ from_: Optional[str] = None
|
|
|
+ to_: Optional[str] = None
|
|
|
+ limit: Optional[int] = None
|
|
|
+ translation_dict: ClassVar[dict] = dict(from_="from", to_="to")
|
|
|
+
|
|
|
+ def into_params(self):
|
|
|
+ return {self._translate_key(k): v for k, v in cattrs.unstructure(self).items()}
|
|
|
+
|
|
|
+ @classmethod
|
|
|
+ def _translate_key(cls, key):
|
|
|
+ return cls.translation_dict.get(key, key)
|
|
|
+
|
|
|
+
|
|
|
+class InternetArchive:
|
|
|
+ # https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
|
|
|
|
|
|
@staticmethod
|
|
|
- def parse_results(line: str):
|
|
|
- [
|
|
|
- id_,
|
|
|
- snap_date,
|
|
|
- url,
|
|
|
- mimetype,
|
|
|
- statuscode,
|
|
|
- hash_,
|
|
|
- size
|
|
|
- ] = line.split(" ")
|
|
|
-
|
|
|
- return snap_date, url
|
|
|
+ def search_snapshots(req: CdxRequest):
|
|
|
+        resp = http_session.get("http://web.archive.org/cdx/search/cdx", params=req.into_params())
|
|
|
+        return [CdxRecord.parse_line(line) for line in resp.text.splitlines()]
|
|
|
|
|
|
@staticmethod
|
|
|
def get_snapshot(url, snap_date):
|
|
|
- return http_session.get(f"http://web.archive.org/web/{snap_date}/{url}", headers={"lang": "fr"})
|
|
|
+ return http_session.get(f"http://web.archive.org/web/{snap_date}/{url}")
|
|
|
|
|
|
|
|
|
class WebPage:
|
|
|
@@ -38,14 +60,15 @@ class WebPage:
|
|
|
self.soup = BeautifulSoup(doc, 'html.parser')
|
|
|
|
|
|
def get_top_articles_titles(self):
|
|
|
- return [s.text.strip() for s in w.soup.find_all("div", class_="top-article")]
|
|
|
+ return [s.text.strip() for s in self.soup.find_all("div", class_="top-article")]
|
|
|
|
|
|
def get_latest_snap():
|
|
|
- r = InternetArchive.search_snapshots("lemonde.fr", {"from": "20240222", "to": "20240222", "limit": 10})
|
|
|
- results = [InternetArchive.parse_results(r) for r in r.text.splitlines()]
|
|
|
+ req = CdxRequest(url="lemonde.fr", from_="20240222", to_="20240222", limit=10, filter="statuscode:200")
|
|
|
+ results = InternetArchive.search_snapshots(req)
|
|
|
|
|
|
- return InternetArchive.get_snapshot(*results[-1])
|
|
|
+ latest = results[-1]
|
|
|
+ print(latest)
|
|
|
+ return InternetArchive.get_snapshot(latest.original, latest.timestamp)
|
|
|
|
|
|
|
|
|
print(WebPage(get_latest_snap().text).get_top_articles_titles())
|
|
|
-
|