Explorar o código

Add basic parsing

jherve hai 1 ano
pai
achega
354cf6104f
Modificáronse 3 ficheiros con 44 adicións e 5 borrados
  1. 26 1
      pdm.lock
  2. 1 0
      pyproject.toml
  3. 17 4
      src/de_quoi_parle_le_monde/__main__.py

+ 26 - 1
pdm.lock

@@ -5,7 +5,7 @@
 groups = ["default"]
 strategy = ["cross_platform", "inherit_metadata"]
 lock_version = "4.4.1"
-content_hash = "sha256:7a82f05c6866efd62cdd35b6f4b656dfc06ff5871ca86dc54e395ff79cb4cecb"
+content_hash = "sha256:d55e0b94e88214e9010d405930d5992c0b9788c107bd9c8557f34b737a612b3a"
 
 [[package]]
 name = "attrs"
@@ -18,6 +18,20 @@ files = [
     {file = "attrs-23.2.0.tar.gz", hash = "sha256:935dc3b529c262f6cf76e50877d35a4bd3c1de194fd41f47a2b7ae8f19971f30"},
 ]
 
+[[package]]
+name = "beautifulsoup4"
+version = "4.12.3"
+requires_python = ">=3.6.0"
+summary = "Screen-scraping library"
+groups = ["default"]
+dependencies = [
+    "soupsieve>1.2",
+]
+files = [
+    {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"},
+    {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"},
+]
+
 [[package]]
 name = "cattrs"
 version = "23.2.3"
@@ -138,6 +152,17 @@ files = [
     {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
 ]
 
+[[package]]
+name = "soupsieve"
+version = "2.5"
+requires_python = ">=3.8"
+summary = "A modern CSS selector implementation for Beautiful Soup."
+groups = ["default"]
+files = [
+    {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"},
+    {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"},
+]
+
 [[package]]
 name = "url-normalize"
 version = "1.4.3"

+ 1 - 0
pyproject.toml

@@ -8,6 +8,7 @@ authors = [
 dependencies = [
     "requests>=2.31.0",
     "requests-cache>=1.2.0",
+    "beautifulsoup4>=4.12.3",
 ]
 requires-python = "==3.11.*"
 readme = "README.md"

+ 17 - 4
src/de_quoi_parle_le_monde/__main__.py

@@ -2,6 +2,7 @@ import requests
 import requests_cache
 from requests_cache.models.response import CachedResponse
 from requests_cache.backends.sqlite import SQLiteCache
+from bs4 import BeautifulSoup
 
 http_session = requests_cache.CachedSession("ia",backend="sqlite")
 
@@ -29,10 +30,29 @@ class InternetArchive:
 
     @staticmethod
     def get_snapshot(url, snap_date):
-        return http_session.get(f"http://web.archive.org/web/{snap_date}/{url}")
+        # "Accept-Language" is the standard request header for language
+        # negotiation; a bare "lang" header is not a real HTTP header and
+        # would simply be ignored by the server.
+        return http_session.get(f"http://web.archive.org/web/{snap_date}/{url}", headers={"Accept-Language": "fr"})
 
-r = InternetArchive.search_snapshots("lemonde.fr", {"from": "20240222", "to": "20240222", "limit": 10})
 
-results = [InternetArchive.parse_results(r) for r in r.text.splitlines()]
+class WebPage:
+    """Thin wrapper around a parsed HTML document."""
+
+    def __init__(self, doc):
+        self.soup = BeautifulSoup(doc, 'html.parser')
+
+    def get_top_articles_titles(self):
+        """Return the stripped text of every div with class "top-article"."""
+        # Fixed: was `w.soup`, which raised NameError (no global `w` exists);
+        # the method must read the instance's own parsed document.
+        return [s.text.strip() for s in self.soup.find_all("div", class_="top-article")]
+
+def get_latest_snap():
+    """Fetch the most recent archived snapshot of lemonde.fr for 2024-02-22."""
+    r = InternetArchive.search_snapshots("lemonde.fr", {"from": "20240222", "to": "20240222", "limit": 10})
+    # Distinct loop variable so it does not shadow the response object `r`.
+    results = [InternetArchive.parse_results(line) for line in r.text.splitlines()]
+
+    return InternetArchive.get_snapshot(*results[-1])
+
+
+print(WebPage(get_latest_snap().text).get_top_articles_titles())
 
-print(InternetArchive.get_snapshot(*results[0]))