1 年之前 · 354cf6104f
--- a/pdm.lock
+++ b/pdm.lock
@@ -5,7 +5,7 @@
 
				 groups = ["default"]
			
 
				 strategy = ["cross_platform", "inherit_metadata"]
			
 
				 lock_version = "4.4.1"
			
 
				-content_hash = "sha256:7a82f05c6866efd62cdd35b6f4b656dfc06ff5871ca86dc54e395ff79cb4cecb"
			
 
				+content_hash = "sha256:d55e0b94e88214e9010d405930d5992c0b9788c107bd9c8557f34b737a612b3a"
			
 
				 
			
 
				 [[package]]
			
 
				 name = "attrs"
			
@@ -18,6 +18,20 @@ files = [
 
				     {file = "attrs-23.2.0.tar.gz", hash = "sha256:935dc3b529c262f6cf76e50877d35a4bd3c1de194fd41f47a2b7ae8f19971f30"},
			
 
				 ]
			
 
				 
			
 
				+[[package]]
			
 
				+name = "beautifulsoup4"
			
 
				+version = "4.12.3"
			
 
				+requires_python = ">=3.6.0"
			
 
				+summary = "Screen-scraping library"
			
 
				+groups = ["default"]
			
 
				+dependencies = [
			
 
				+    "soupsieve>1.2",
			
 
				+]
			
 
				+files = [
			
 
				+    {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"},
			
 
				+    {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"},
			
 
				+]
			
 
				+
			
 
				 [[package]]
			
 
				 name = "cattrs"
			
 
				 version = "23.2.3"
			
@@ -138,6 +152,17 @@ files = [
 
				     {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
			
 
				 ]
			
 
				 
			
 
				+[[package]]
			
 
				+name = "soupsieve"
			
 
				+version = "2.5"
			
 
				+requires_python = ">=3.8"
			
 
				+summary = "A modern CSS selector implementation for Beautiful Soup."
			
 
				+groups = ["default"]
			
 
				+files = [
			
 
				+    {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"},
			
 
				+    {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"},
			
 
				+]
			
 
				+
			
 
				 [[package]]
			
 
				 name = "url-normalize"
			
 
				 version = "1.4.3"
			
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,6 +8,7 @@ authors = [
 
				 dependencies = [
			
 
				     "requests>=2.31.0",
			
 
				     "requests-cache>=1.2.0",
			
 
				+    "beautifulsoup4>=4.12.3",
			
 
				 ]
			
 
				 requires-python = "==3.11.*"
			
 
				 readme = "README.md"
			
--- a/src/de_quoi_parle_le_monde/__main__.py
+++ b/src/de_quoi_parle_le_monde/__main__.py
@@ -2,6 +2,7 @@ import requests
 
				 import requests_cache
			
 
				 from requests_cache.models.response import CachedResponse
			
 
				 from requests_cache.backends.sqlite import SQLiteCache
			
 
				+from bs4 import BeautifulSoup
			
 
				 
			
 
				 http_session = requests_cache.CachedSession("ia",backend="sqlite")
			
 
				 
			
@@ -29,10 +30,22 @@ class InternetArchive:
 
				 
			
 
				     @staticmethod
				     def get_snapshot(url, snap_date):
				-        return http_session.get(f"http://web.archive.org/web/{snap_date}/{url}")
				+        """Fetch the archived copy of ``url`` for the capture ``snap_date``.
				+
				+        The request is issued through the module-level ``http_session``
				+        (a ``requests_cache.CachedSession``) and the response object it
				+        returns is handed back unchanged.
				+        """
				+        # "lang" is not an HTTP request header, so it expressed nothing;
				+        # language preference is carried by the standard Accept-Language
				+        # header.
				+        return http_session.get(
				+            f"http://web.archive.org/web/{snap_date}/{url}",
				+            headers={"Accept-Language": "fr"},
				+        )
			
 
				 
			
 
				-r = InternetArchive.search_snapshots("lemonde.fr", {"from": "20240222", "to": "20240222", "limit": 10})
			
 
				 
			
 
				-results = [InternetArchive.parse_results(r) for r in r.text.splitlines()]
			
 
				+class WebPage:
				+    """An HTML document parsed with BeautifulSoup, plus extraction helpers."""
				+
				+    def __init__(self, doc):
				+        """Parse the markup ``doc`` using the ``html.parser`` backend."""
				+        self.soup = BeautifulSoup(doc, 'html.parser')
				+
				+    def get_top_articles_titles(self):
				+        """Return the stripped text of every ``<div class="top-article">``.
				+
				+        The result is a list of strings in the order ``find_all`` yields
				+        the matching elements; it is empty when nothing matches.
				+        """
				+        # Read this instance's soup: the former ``w`` was an undefined name
				+        # and raised NameError on every call.
				+        return [s.text.strip() for s in self.soup.find_all("div", class_="top-article")]
			
 
				+
			
 
				+def get_latest_snap():
				+    """Download the last snapshot listed for lemonde.fr.
				+
				+    Runs ``InternetArchive.search_snapshots`` with the fixed
				+    ``"from"``/``"to"`` value ``"20240222"`` and ``"limit": 10``, parses
				+    each line of the response text with ``InternetArchive.parse_results``
				+    and fetches the snapshot described by the final parsed entry.
				+
				+    Raises ``IndexError`` when the search response contains no lines.
				+    """
				+    query = {"from": "20240222", "to": "20240222", "limit": 10}
				+    listing = InternetArchive.search_snapshots("lemonde.fr", query)
				+    snapshots = list(map(InternetArchive.parse_results, listing.text.splitlines()))
				+    return InternetArchive.get_snapshot(*snapshots[-1])
			
 
				+
			
 
				+
			
 
				+# Script entry: fetch the latest snapshot, parse its HTML body and print
				+# the list of top-article titles found in it.
				+print(WebPage(get_latest_snap().text).get_top_articles_titles())
			
 
				 
			
 
				-print(InternetArchive.get_snapshot(*results[0]))