|
|
@@ -1,14 +1,11 @@
|
|
|
-import requests
|
|
|
import requests_cache
|
|
|
from attrs import frozen
|
|
|
from typing import Optional, ClassVar
|
|
|
from datetime import date, datetime
|
|
|
import cattrs
|
|
|
-from requests_cache.models.response import CachedResponse
|
|
|
-from requests_cache.backends.sqlite import SQLiteCache
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
-http_session = requests_cache.CachedSession("ia",backend="sqlite")
|
|
|
+http_session = requests_cache.CachedSession("ia", backend="sqlite")
|
|
|
|
|
|
|
|
|
@frozen
|
|
|
@@ -19,7 +16,7 @@ class CdxRecord:
|
|
|
mimetype: str
|
|
|
statuscode: int
|
|
|
digest: str
|
|
|
- length:int
|
|
|
+ length: int
|
|
|
|
|
|
@staticmethod
|
|
|
def parse_line(line: str):
|
|
|
@@ -39,7 +36,10 @@ class CdxRequest:
|
|
|
datetime_format: ClassVar[str] = "%Y%m%d%H%M%S"
|
|
|
|
|
|
def into_params(self) -> dict[str, str]:
|
|
|
- return {self._translate_key(k): self._stringify_value(v) for k, v in cattrs.unstructure(self).items()}
|
|
|
+ return {
|
|
|
+ self._translate_key(k): self._stringify_value(v)
|
|
|
+ for k, v in cattrs.unstructure(self).items()
|
|
|
+ }
|
|
|
|
|
|
@classmethod
|
|
|
def _translate_key(cls, key: str) -> str:
|
|
|
@@ -60,8 +60,10 @@ class InternetArchive:
|
|
|
|
|
|
@staticmethod
|
|
|
def search_snapshots(req: CdxRequest):
|
|
|
- resp = http_session.get("http://web.archive.org/cdx/search/cdx?", req.into_params())
|
|
|
- return [CdxRecord.parse_line(l) for l in resp.text.splitlines()]
|
|
|
+ resp = http_session.get(
|
|
|
+            "http://web.archive.org/cdx/search/cdx?", params=req.into_params()
|
|
|
+ )
|
|
|
+ return [CdxRecord.parse_line(line) for line in resp.text.splitlines()]
|
|
|
|
|
|
@staticmethod
|
|
|
def get_snapshot(url, snap_date):
|
|
|
@@ -70,13 +72,20 @@ class InternetArchive:
|
|
|
|
|
|
class WebPage:
|
|
|
def __init__(self, doc):
|
|
|
- self.soup = BeautifulSoup(doc, 'html.parser')
|
|
|
+ self.soup = BeautifulSoup(doc, "html.parser")
|
|
|
|
|
|
def get_top_articles_titles(self):
|
|
|
return [s.text.strip() for s in self.soup.find_all("div", class_="top-article")]
|
|
|
|
|
|
+
|
|
|
def get_latest_snap():
|
|
|
- req = CdxRequest(url="lemonde.fr", from_=date.today(), to_=date.today(), limit=10, filter="statuscode:200")
|
|
|
+ req = CdxRequest(
|
|
|
+ url="lemonde.fr",
|
|
|
+ from_=date.today(),
|
|
|
+ to_=date.today(),
|
|
|
+ limit=10,
|
|
|
+ filter="statuscode:200",
|
|
|
+ )
|
|
|
results = InternetArchive.search_snapshots(req)
|
|
|
|
|
|
latest = results[-1]
|