Jelajahi Sumber

Add media images scraping

theenglishway (time) 7 tahun lalu
induk
melakukan
c9a709e1d8

+ 23 - 2
tests/conftest.py

@@ -8,6 +8,8 @@ from twhatter.client import ClientTimeline
 from twhatter.parser import tweet_factory
 from typing import NamedTuple, List
 
+from twhatter.parser.media import MediaBase
+
 @pytest.fixture
 def cli_runner():
     """Runner for Click"""
@@ -31,6 +33,15 @@ def tweet_limit():
 # Fixtures for extraction of specific tweets of several kinds, whose author
 # and id are known in advance
 
+class MediaInfo(NamedTuple):
+    """Expected values for a media whose content is known in advance"""
+    image_links: List[str] = []  # NOTE(review): this default list is shared by every instance
+
+    def __eq__(self, other):
+        """Equal only to a `MediaBase` instance holding the same `image_links`"""
+        return (isinstance(other, MediaBase)
+                and other.image_links == self.image_links)
+
 
 class TweetInfo(NamedTuple):
     """Class to hold information about a tweet that is already known"""
@@ -50,6 +61,7 @@ class TweetInfo(NamedTuple):
     reacted_id: int = None
     reacted_user_id: int = None
     link_to: str = None
+    media: MediaInfo = None
 
 
 @pytest.fixture(scope="session")
@@ -114,7 +126,16 @@ def tweet_collection():
             retweeter="the_english_way",
             comments_nb=12,
             retweets_nb=176,
-            likes_nb=556
+            likes_nb=555
+        ),
+        'media':TweetInfo(
+            id=1086327536726900736,
+            screen_name="the_english_way",
+            user_id=943804775942033408,
+            permalink="/the_english_way/status/1086327536726900736",
+            media=MediaInfo(
+                image_links=["https://pbs.twimg.com/media/DxNof6AXQAAu2oU.jpg"]
+            )
         ),
     }
 
@@ -172,7 +193,7 @@ def user_collection():
             screen_name="Marlene Hansen",
             join_date=datetime(2011, 5, 8, 0, 0),
             tweets_nb=25,
-            following_nb=344,
+            following_nb=342,
             followers_nb=81,
             likes_nb=4
         ),

+ 9 - 2
tests/test_parser.py

@@ -22,6 +22,7 @@ class TestTweet:
         "hashtags",
         "mentions",
         "stats",
+        "media",
     ]
 
     @pytest.mark.parametrize("tweet_type", all_types)
@@ -34,7 +35,7 @@ class TestTweet:
             # of them, the expected values are not set on purpose and therefore
             # not tested
             if value is not None:
-                assert getattr(t, field) == value
+                assert value == getattr(t, field)
 
     @pytest.mark.parametrize("tweet_type,expected_class", [
         ('plain', TweetTextOnly),
@@ -46,6 +47,12 @@ class TestTweet:
         t, tweet_info = tweet_test_data_factory(tweet_type)
         assert isinstance(t, expected_class)
 
+    @pytest.mark.parametrize("media_type,expected_class", [
+        ('media', MediaImage),  # 'media' is a key of the tweet_collection fixture data
+    ])
+    def test_media_type(self, tweet_test_data_factory, media_type, expected_class):
+        t, tweet_info = tweet_test_data_factory(media_type)  # media_type fills the factory's tweet-type argument
+        assert isinstance(t.media, expected_class)
 
 class TestUser:
     all_handles = [
@@ -65,4 +72,4 @@ class TestUser:
             # of them, the expected values are not set on purpose and therefore
             # not tested
             if value is not None:
-                assert getattr(user, field) == value
+                assert value == getattr(user, field)

+ 5 - 1
twhatter/parser/__init__.py

@@ -2,6 +2,7 @@ from .tweet import (TweetList, TweetBase,
                     tweet_factory,
                     TweetTextOnly, TweetLink, TweetReaction, TweetRetweet)
 from .user import user_factory
+from .media import MediaImage, media_factory
 
 __all__= [
     "TweetList",
@@ -12,5 +13,8 @@ __all__= [
     "TweetReaction",
     "TweetRetweet",
 
-    "user_factory"
+    "user_factory",
+
+    "MediaImage",
+    "media_factory"
 ]

+ 58 - 0
twhatter/parser/media.py

@@ -0,0 +1,58 @@
+from datetime import datetime
+
+from bs4 import BeautifulSoup
+from dataclasses import dataclass, fields, InitVar, field
+from typing import List, Optional
+
+from .mixins import ExtractableMixin
+
+
+@dataclass
+class MediaBase(ExtractableMixin):
+    #: Links to images contained in the media
+    image_links: List[str]
+
+    @staticmethod
+    def extract_image_links(soup):
+        if not soup:  # falsy soup (e.g. None): nothing to extract
+            return []
+
+        try:
+            datakw = "data-image-url"
+            return [
+                div[datakw]
+                for div in soup.find_all('div', attrs={datakw: True})  # only divs carrying the attribute
+            ]
+        except:  # NOTE(review): bare except that only re-raises, so this try block changes nothing
+            raise
+
+    #: The soup extracted from the raw HTML
+    soup: InitVar[BeautifulSoup] = None
+
+    # Example of an image link: https://pbs.twimg.com/media/DxNof6AXQAAu2oU.jpg
+
+
+class MediaImage(MediaBase):
+    @staticmethod
+    def condition(kwargs):
+        return kwargs['image_links']  # returned as-is; media_factory tests it for truthiness
+
+
+
+def media_factory(soup: BeautifulSoup) -> Optional[MediaBase]:
+    """
+    :param soup: the soup extracted from the raw html for that media
+    :return: instance of the first `MediaBase` subclass whose `condition` holds, else None
+    """
+    kwargs = {
+        f.name: MediaBase._extract_value(soup, f) for f in fields(MediaBase)  # one value per dataclass field
+    }
+
+    for kls in MediaBase.__subclasses__():  # direct subclasses only
+        try:
+            if kls.condition(kwargs):
+                return kls(soup=soup, **kwargs)
+        except NotImplementedError:  # presumably from a subclass without its own `condition` - TODO confirm
+            continue
+
+    return None

+ 8 - 0
twhatter/parser/tweet.py

@@ -5,6 +5,7 @@ from dataclasses import dataclass, fields, InitVar, field
 from typing import List
 
 from .mixins import ExtractableMixin
+from .media import MediaBase, media_factory
 
 
 @dataclass
@@ -45,6 +46,9 @@ class TweetBase(ExtractableMixin):
     #: Link contained within the tweet
     link_to: str = None
 
+    #: Media that the tweet contains
+    media: MediaBase = None
+
     #: The soup extracted from the raw HTML
     soup: InitVar[BeautifulSoup] = None
 
@@ -172,6 +176,10 @@ class TweetBase(ExtractableMixin):
     def extract_text(soup):
         return soup.find('p', 'tweet-text').text
 
+    @staticmethod
+    def extract_media(soup):
+        return media_factory(soup.find('div', 'AdaptiveMedia'))  # find() result is forwarded unchecked
+
 
 class TweetTextOnly(TweetBase):
     """An original tweet with only plain text"""