Преглед изворног кода

Reimplement parsing for all medias

jherve пре 1 година
родитељ
комит
6a2b6ad2aa

+ 4 - 9
src/media_observer/medias/bfmtv.py

@@ -2,7 +2,6 @@ from media_observer.article import (
     TopArticle,
     MainArticle,
     FrontPage,
-    to_text,
 )
 
 
@@ -12,7 +11,7 @@ class BfmTvFrontPage(FrontPage):
         all_articles = soup.select("section[id*='top_contenus'] li > a")
         return [
             TopArticle.create(
-                title=to_text(a, "h3"),
+                title=a.select_unique("h3").stripped_text,
                 url=a["href"],
                 rank=idx + 1,
             )
@@ -21,12 +20,8 @@ class BfmTvFrontPage(FrontPage):
 
     @staticmethod
     def get_main_article(soup):
-        def to_href(soup):
-            link = soup.select("a")[0]
-            return link["href"]
-
-        [main] = soup.select("article.une_item")
+        main = soup.select_unique("article.une_item")
         return MainArticle.create(
-            title=to_text(main, "h2.title_une_item"),
-            url=to_href(main),
+            title=main.select_unique("h2.title_une_item").stripped_text,
+            url=main.select_first("a")["href"],
         )

+ 4 - 5
src/media_observer/medias/cnews.py

@@ -4,7 +4,6 @@ from media_observer.article import (
     TopArticle,
     MainArticle,
     FrontPage,
-    to_text,
 )
 
 
@@ -15,7 +14,7 @@ class CNewsFrontPage(FrontPage):
 
         return [
             TopArticle.create(
-                title=to_text(a, "h3.dm-letop-title"),
+                title=a.select_unique("h3.dm-letop-title").stripped_text,
                 url=a["href"],
                 rank=idx + 1,
             )
@@ -24,10 +23,10 @@ class CNewsFrontPage(FrontPage):
 
     @staticmethod
     def get_main_article(soup):
-        main = soup.select("div.dm-block")[0]
-        [url] = main.select("a")
+        main = soup.select_first("div.dm-block")
+        url = main.select_unique("a")
 
         return MainArticle.create(
-            title=to_text(main, "h2.dm-news-title"),
+            title=main.select_unique("h2.dm-news-title").stripped_text,
             url=url["href"],
         )

+ 15 - 22
src/media_observer/medias/france_tv_info.py

@@ -2,23 +2,18 @@ from media_observer.article import (
     TopArticle,
     MainArticle,
     FrontPage,
-    to_text,
 )
 
 
 class FranceTvInfoFrontPage(FrontPage):
     @staticmethod
     def get_top_articles(soup):
-        def to_href(article, selector):
-            [url] = article.select(selector)
-            return url["href"]
-
         all_articles = soup.select("article.card-article-most-read")
 
         return [
             TopArticle.create(
-                title=to_text(a, "p.card-article-most-read__title"),
-                url=to_href(a, "a"),
+                title=a.select_unique("p.card-article-most-read__title").stripped_text,
+                url=a.select_unique("a")["href"],
                 rank=idx + 1,
             )
             for idx, a in enumerate(all_articles)
@@ -26,20 +21,18 @@ class FranceTvInfoFrontPage(FrontPage):
 
     @staticmethod
     def get_main_article(soup):
-        def select_first_of(soup, *selectors):
-            for s in selectors:
-                if found := soup.select(s):
-                    return found
-            return None
+        def get_kwargs(main_selector, title_selector):
+            main = soup.select_unique(main_selector)
+            title = main.select_unique(title_selector)
+            return dict(title=title.stripped_text, url=main.select_unique("a")["href"])
 
-        [main] = select_first_of(
-            soup, "article.card-article-majeure", "article.card-article-actu-forte"
-        )
-        [title] = select_first_of(
-            main, ".card-article-majeure__title", ".card-article-actu-forte__title"
-        )
+        try:
+            kwargs = get_kwargs(
+                "article.card-article-majeure", ".card-article-majeure__title"
+            )
+        except ValueError:
+            kwargs = get_kwargs(
+                "article.card-article-actu-forte", ".card-article-actu-forte__title"
+            )
 
-        return MainArticle.create(
-            title=title.text.strip(),
-            url=main.find("a")["href"],
-        )
+        return MainArticle.create(**kwargs)

+ 3 - 4
src/media_observer/medias/le_figaro.py

@@ -3,7 +3,6 @@ from bs4 import BeautifulSoup
 from media_observer.article import (
     MainArticle,
     FrontPage,
-    to_text,
 )
 
 
@@ -16,10 +15,10 @@ class LeFigaroFrontPage(FrontPage):
 
     @staticmethod
     def get_main_article(soup):
-        main = soup.select(".fig-main .fig-ensemble__first-article")[0]
-        url = main.select("a")[0]
+        main = soup.select_first(".fig-main .fig-ensemble__first-article")
+        url = main.select_first("a")
 
         return MainArticle.create(
-            title=to_text(main, ".fig-ensemble__title"),
+            title=main.select_unique(".fig-ensemble__title").stripped_text,
             url=url["href"],
         )

+ 5 - 10
src/media_observer/medias/le_monde.py

@@ -2,7 +2,6 @@ from media_observer.article import (
     TopArticle,
     MainArticle,
     FrontPage,
-    to_text,
 )
 
 
@@ -12,8 +11,8 @@ class LeMondeFrontPage(FrontPage):
         all_articles = soup.select("div.top-article")
         return [
             TopArticle.create(
-                title=a.text.strip(),
-                url=a.find("a")["href"],
+                title=a.stripped_text,
+                url=a.select_unique("a")["href"],
                 rank=idx + 1,
             )
             for idx, a in enumerate(all_articles)
@@ -21,12 +20,8 @@ class LeMondeFrontPage(FrontPage):
 
     @staticmethod
     def get_main_article(soup):
-        def to_href(soup):
-            link = soup.select("a")[0]
-            return link["href"]
-
-        [main] = soup.select("div.article--main")
+        main = soup.select_unique("div.article--main")
         return MainArticle.create(
-            title=to_text(main, "p.article__title-label"),
-            url=to_href(main),
+            title=main.select_unique("p.article__title-label").stripped_text,
+            url=main.select_first("a")["href"],
         )

+ 4 - 4
src/media_observer/medias/le_parisien.py

@@ -14,7 +14,7 @@ class LeParisienFrontPage(FrontPage):
 
         return [
             TopArticle.create(
-                title=a.text.strip(),
+                title=a.stripped_text,
                 url=a["href"],
                 rank=idx + 1,
             )
@@ -23,10 +23,10 @@ class LeParisienFrontPage(FrontPage):
 
     @staticmethod
     def get_main_article(soup):
-        main = soup.select(".homepage__top article")[0]
-        url = main.select("a")[0]
+        main = soup.select_first(".homepage__top article")
+        url = main.select_first("a")
 
         return MainArticle.create(
-            title=url.text.strip(),
+            title=url.stripped_text,
             url=url["href"],
         )

+ 5 - 5
src/media_observer/medias/tf1_info.py

@@ -19,19 +19,19 @@ class Tf1InfoFrontPage(FrontPage):
 
     @staticmethod
     def _get_top_article(soup: BeautifulSoup, idx: int):
-        [a] = soup.select("a")
+        a = soup.select_unique("a")
         return TopArticle.create(
-            title=a.text.strip(),
+            title=a.stripped_text,
             url=a["href"],
             rank=idx + 1,
         )
 
     @staticmethod
     def get_main_article(soup):
-        main = soup.select("#headlineid .ArticleCard__Title")[0]
-        [url] = main.select("a")
+        main = soup.select_first("#headlineid .ArticleCard__Title")
+        url = main.select_unique("a")
 
         return MainArticle.create(
-            title=url.text.strip(),
+            title=url.stripped_text,
             url=url["href"],
         )