Handle user page parsing

theenglishway, 7 years ago
parent commit 4e15ebe198

+ 26 - 0
tests/conftest.py

@@ -146,3 +146,29 @@ def tweet_test_data_factory(raw_tweet_factory, tweet_collection):
         return tweet_factory(raw_tweet), tweet_info
 
     return _tweet_test_data_factory
+
+
+class UserInfo(NamedTuple):
+    """Class to hold information about an user that is already known"""
+    id: int
+    screen_name: str
+    join_date: datetime
+    tweets_nb: int
+    following_nb: int
+    followers_nb: int
+    likes_nb: int
+
+
+@pytest.fixture(scope="session")
+def user_collection():
+    return {
+        'Marlene_beadles': UserInfo(
+            id=295177446,
+            screen_name="Marlene Hansen",
+            join_date=datetime(2011, 5, 8, 0, 0),
+            tweets_nb=25,
+            following_nb=344,
+            followers_nb=81,
+            likes_nb=4
+        ),
+    }

+ 15 - 0
tests/test_parser.py

@@ -45,3 +45,18 @@ class TestTweet:
     def test_tweet_type(self, tweet_test_data_factory, tweet_type, expected_class):
         t, tweet_info = tweet_test_data_factory(tweet_type)
         assert isinstance(t, expected_class)
+
+
+class TestUser:
+    all_handles = [
+        "Marlene_beadles"
+    ]
+
+    @pytest.mark.parametrize("user_handle", all_handles)
+    def test_user(self, raw_html_user_initial_page_factory, user_collection, user_handle):
+        user_info = user_collection[user_handle]
+        raw_user = raw_html_user_initial_page_factory(user_handle)
+        user = user_factory(raw_user)
+
+        for field, value in user_info._asdict().items():
+            assert getattr(user, field) == value

+ 1 - 1
twhatter/api.py

@@ -32,7 +32,7 @@ class ApiUser(Api):
         url = "https://twitter.com/{}".format(self.user)
         return requests.get(
             url,
-            headers={'User-Agent': choice(self.HEADERS_LIST)}
+            headers={'User-Agent': choice(self.HEADERS_LIST), 'Accept-Language': 'en'}
         )
 
     def get_more_tweets(self):
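Note: the Accept-Language header is presumably added so Twitter serves the profile page in English, which keeps the join date string in the '%d %B %Y' form that User.extract_join_date (further down) passes to strptime.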

+ 4 - 1
twhatter/parser/__init__.py

@@ -1,6 +1,7 @@
 from .tweet import (TweetList, TweetBase,
                     tweet_factory,
                     TweetTextOnly, TweetLink, TweetReaction, TweetRetweet)
+from .user import user_factory
 
 __all__= [
     "TweetList",
@@ -9,5 +10,7 @@ __all__= [
     "TweetTextOnly",
     "TweetLink",
     "TweetReaction",
-    "TweetRetweet"
+    "TweetRetweet",
+
+    "user_factory"
 ]

+ 40 - 0
twhatter/parser/mixins.py

@@ -0,0 +1,40 @@
+from dataclasses import Field, InitVar
+from typing import Any
+
+from bs4 import BeautifulSoup
+
+
+class ExtractableMixin:
+    """Mixin to extract values from soup"""
+    def __post_init__(self, soup: BeautifulSoup):
+        self.soup = soup
+
+    @classmethod
+    def _extract_value(cls, soup: BeautifulSoup, data_field: Field) -> Any:
+        fn = getattr(cls, "extract_{}".format(data_field.name), None)
+        if not fn:
+            raise NotImplementedError(
+                "Extract function for field '{}' is not "
+                "implemented".format(data_field.name)
+            )
+
+        return fn(soup)
+
+    @staticmethod
+    def _extract_from_div(soup, div_class, data_kw):
+        kw = "data-{}".format(data_kw)
+        return(
+            soup.find('div', class_=div_class, attrs={kw: True})[kw]
+        )
+
+    @staticmethod
+    def _extract_from_span(soup, distinct_span, data_kw):
+        return (
+            soup.find('span', distinct_span)
+                .find('span', attrs={data_kw: True})
+            [data_kw]
+        )
+
+    @staticmethod
+    def extract_soup(soup):
+        return soup
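For context on how the mixin is meant to be consumed (the Example class below is illustrative only, not part of the commit): _extract_value looks up an extract_<field> callable named after each dataclass field, the soup InitVar is handed to __post_init__ so the raw markup stays attached to the instance, and a field without a matching extract_ method raises NotImplementedError.

    from dataclasses import dataclass, fields, InitVar
    from bs4 import BeautifulSoup

    from twhatter.parser.mixins import ExtractableMixin

    @dataclass
    class Example(ExtractableMixin):
        title: str
        #: The soup extracted from the raw HTML, consumed by __post_init__
        soup: InitVar[BeautifulSoup] = None

        @staticmethod
        def extract_title(soup):
            return soup.find('h1').text

    soup = BeautifulSoup("<h1>hello</h1>", "html.parser")
    # InitVar pseudo-fields are skipped by fields(), so only 'title' is extracted here
    kwargs = {f.name: Example._extract_value(soup, f) for f in fields(Example)}
    example = Example(soup=soup, **kwargs)  # Example(title='hello')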

+ 6 - 34
twhatter/parser/tweet.py

@@ -4,9 +4,11 @@ from bs4 import BeautifulSoup
 from dataclasses import dataclass, fields, InitVar, field
 from typing import List
 
+from .mixins import ExtractableMixin
+
 
 @dataclass
-class TweetBase:
+class TweetBase(ExtractableMixin):
     #: Tweet ID
     id: int
     #: Handle of the tweet's original author
@@ -46,9 +48,6 @@ class TweetBase:
     #: The soup extracted from the raw HTML
     soup: InitVar[BeautifulSoup] = None
 
-    def __post_init__(self, soup: BeautifulSoup):
-        self.soup = soup
-
     def __repr__(self):
         return ("<{0} "
                 "(id={1.id}, "
@@ -61,25 +60,10 @@ class TweetBase:
     def condition(kwargs: dict) -> bool:
         raise NotImplementedError()
 
-    @staticmethod
-    def _extract_from_span(soup, distinct_span, data_kw):
-        return (
-            soup.find('span', distinct_span)
-                .find('span', attrs={data_kw: True})
-            [data_kw]
-        )
-
     @classmethod
     def _extract_from_div_tweet(cls, soup, data_kw):
         return cls._extract_from_div(soup, 'tweet', data_kw)
 
-    @staticmethod
-    def _extract_from_div(soup, div_class, data_kw):
-        kw = "data-{}".format(data_kw)
-        return(
-            soup.find('div', class_=div_class, attrs={kw: True})[kw]
-        )
-
     @staticmethod
     def extract_id(soup):
         return int(soup['data-item-id'])
@@ -188,10 +172,6 @@ class TweetBase:
     def extract_text(soup):
         return soup.find('p', 'tweet-text').text
 
-    @staticmethod
-    def extract_soup(soup):
-        return soup
-
 
 class TweetTextOnly(TweetBase):
     """An original tweet with only plain text"""
@@ -223,17 +203,9 @@ def tweet_factory(soup: BeautifulSoup) -> TweetBase:
     :param soup: the soup extracted from the raw html for that tweet
     :return: a well-formatted Tweet
     """
-    def _extract_value(data_field):
-        fn = getattr(TweetBase, "extract_{}".format(data_field.name), None)
-        if not fn:
-            raise NotImplementedError(
-                "Extract function for field '{}' is not "
-                "implemented".format(data_field.name)
-            )
-
-        return fn(soup)
-
-    kwargs = {f.name: _extract_value(f) for f in fields(TweetBase)}
+    kwargs = {
+        f.name: TweetBase._extract_value(soup, f) for f in fields(TweetBase)
+    }
 
     for kls in TweetBase.__subclasses__():
         try:

+ 75 - 0
twhatter/parser/user.py

@@ -0,0 +1,75 @@
+from datetime import datetime
+
+from bs4 import BeautifulSoup
+from dataclasses import dataclass, fields, InitVar, field
+from typing import List
+
+from .mixins import ExtractableMixin
+
+
+@dataclass
+class User(ExtractableMixin):
+    id: int
+    screen_name: str
+    join_date: datetime
+    tweets_nb: int
+    following_nb: int
+    followers_nb: int
+    likes_nb: int
+
+    #: The soup extracted from the raw HTML
+    soup: InitVar[BeautifulSoup] = None
+
+    @staticmethod
+    def _extract_from_li(soup, distinct_span, kw):
+        data_kw = "data-{}".format(kw)
+        return (
+            soup.find('li', distinct_span)
+                .find('span', attrs={data_kw: True})
+            [data_kw]
+        )
+
+    @classmethod
+    def extract_id(cls, soup):
+        return int(cls._extract_from_div(soup, 'ProfileNav', 'user-id'))
+
+    @classmethod
+    def extract_screen_name(cls, soup):
+        return soup.find('a', 'ProfileHeaderCard-nameLink').text
+
+    @classmethod
+    def extract_join_date(cls, soup):
+        kw = 'title'
+        datetime_str = soup.find(
+            'span',
+            class_='ProfileHeaderCard-joinDateText',
+            attrs={kw: True}
+        )[kw]
+
+        # The date is in a weird format (e.g. "7:27 AM - 8 May 2011") and we
+        # don't really care for the exact hour so we only keep the date
+        day_str = datetime_str.split(' - ')[1]
+        return datetime.strptime(day_str, '%d %B %Y')
+
+    @classmethod
+    def extract_tweets_nb(cls, soup):
+        return int(cls._extract_from_li(soup, 'ProfileNav-item--tweets', 'count'))
+
+    @classmethod
+    def extract_following_nb(cls, soup):
+        return int(cls._extract_from_li(soup, 'ProfileNav-item--following', 'count'))
+
+    @classmethod
+    def extract_followers_nb(cls, soup):
+        return int(cls._extract_from_li(soup, 'ProfileNav-item--followers', 'count'))
+
+    @classmethod
+    def extract_likes_nb(cls, soup):
+        return int(cls._extract_from_li(soup, 'ProfileNav-item--favorites', 'count'))
+
+
+def user_factory(soup: BeautifulSoup) -> User:
+    kwargs = {
+        f.name: User._extract_value(soup, f) for f in fields(User)
+    }
+    return User(**kwargs)
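
Putting the pieces together, the new parser might be exercised along these lines (a sketch only: it mirrors the request ApiUser builds above instead of calling it, since the method's name isn't visible in that hunk, and it assumes Twitter still serves the legacy profile markup with the ProfileNav/ProfileHeaderCard classes):

    import requests
    from bs4 import BeautifulSoup

    from twhatter.parser import user_factory

    # Same request shape as ApiUser above; Accept-Language keeps the page, and
    # therefore the join date string, in English.
    response = requests.get(
        "https://twitter.com/Marlene_beadles",
        headers={'User-Agent': 'Mozilla/5.0', 'Accept-Language': 'en'}
    )
    soup = BeautifulSoup(response.text, "html.parser")

    user = user_factory(soup)  # every User field filled via its extract_<field> method
    print(user.screen_name, user.join_date, user.followers_nb)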