Преглед изворног кода

Add a subclass of BeautifulSoup with extra methods

jherve пре 1 године
родитељ
комит
b263621447
1 измењен фајл са 39 додатих и 5 уклоњених редова
  1. 39 5
      src/media_observer/article.py

+ 39 - 5
src/media_observer/article.py

@@ -2,7 +2,7 @@ import asyncio
 from abc import ABC, abstractmethod
 from attrs import frozen, field, validators
 import cattrs
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, ResultSet
 from yarl import URL
 from zoneinfo import ZoneInfo
 
@@ -95,25 +95,59 @@ class MainArticle(ABC):
         return cls(article)
 
 
+class MagnificentSoup(BeautifulSoup):
+    def select(self, *args, **kwargs):
+        def to_magnificient(soup):
+            soup.__class__ = MagnificentSoup
+            return soup
+
+        return ResultSet(
+            None, [to_magnificient(s) for s in super().select(*args, **kwargs)]
+        )
+
+    def select_first(self, selector: str) -> "MagnificentSoup":
+        try:
+            soup = self.select(selector)[0]
+            soup.__class__ = MagnificentSoup
+            return soup
+        except IndexError:
+            raise ValueError(f"Could not find {selector}")
+
+    def select_unique(self, selector: str) -> "MagnificentSoup":
+        match self.select(selector):
+            case [soup]:
+                soup.__class__ = MagnificentSoup
+                return soup
+
+            case many_or_zero:
+                raise ValueError(
+                    f"Expected a unique element matching {selector}, found {len(many_or_zero)}"
+                )
+
+    @property
+    def stripped_text(self) -> str:
+        return self.text.strip()
+
+
 @frozen
 class FrontPage(ABC):
     snapshot: InternetArchiveSnapshot
-    soup: BeautifulSoup = field(repr=False)
+    soup: MagnificentSoup = field(repr=False)
     top_articles: list[TopArticle]
     main_article: MainArticle
 
     @staticmethod
     @abstractmethod
-    def get_top_articles(soup: BeautifulSoup) -> list[TopArticle]: ...
+    def get_top_articles(soup: MagnificentSoup) -> list[TopArticle]: ...
 
     @staticmethod
     @abstractmethod
-    def get_main_article(soup: BeautifulSoup) -> MainArticle: ...
+    def get_main_article(soup: MagnificentSoup) -> MainArticle: ...
 
     @classmethod
     async def from_snapshot(cls, snapshot: InternetArchiveSnapshot):
         loop = asyncio.get_event_loop()
-        soup = await loop.run_in_executor(None, BeautifulSoup, snapshot.text, "lxml")
+        soup = await loop.run_in_executor(None, MagnificentSoup, snapshot.text, "lxml")
 
         return cls(
             snapshot, soup, cls.get_top_articles(soup), cls.get_main_article(soup)