Handle user page parsing

theenglishway, 7 years ago
parent commit 4e15ebe198

+ 26 - 0
tests/conftest.py

@@ -146,3 +146,29 @@ def tweet_test_data_factory(raw_tweet_factory, tweet_collection):
         return tweet_factory(raw_tweet), tweet_info
 
     return _tweet_test_data_factory
+
+
+class UserInfo(NamedTuple):
+    """Class to hold information about an user that is already known"""
+    id: int
+    screen_name: str
+    join_date: datetime
+    tweets_nb: int
+    following_nb: int
+    followers_nb: int
+    likes_nb: int
+
+
+@pytest.fixture(scope="session")
+def user_collection():
+    return {
+        'Marlene_beadles': UserInfo(
+            id=295177446,
+            screen_name="Marlene Hansen",
+            join_date=datetime(2011, 5, 8, 0, 0),
+            tweets_nb=25,
+            following_nb=344,
+            followers_nb=81,
+            likes_nb=4
+        ),
+    }

+ 15 - 0
tests/test_parser.py

@@ -45,3 +45,18 @@ class TestTweet:
     def test_tweet_type(self, tweet_test_data_factory, tweet_type, expected_class):
         t, tweet_info = tweet_test_data_factory(tweet_type)
         assert isinstance(t, expected_class)
+
+
+class TestUser:
+    all_handles = [
+        "Marlene_beadles"
+    ]
+
+    @pytest.mark.parametrize("user_handle", all_handles)
+    def test_user(self, raw_html_user_initial_page_factory, user_collection, user_handle):
+        user_info = user_collection[user_handle]
+        raw_user = raw_html_user_initial_page_factory(user_handle)
+        user = user_factory(raw_user)
+
+        for field, value in user_info._asdict().items():
+            assert getattr(user, field) == value

+ 1 - 1
twhatter/api.py

@@ -32,7 +32,7 @@ class ApiUser(Api):
         url = "https://twitter.com/{}".format(self.user)
         return requests.get(
             url,
-            headers={'User-Agent': choice(self.HEADERS_LIST)}
+            headers={'User-Agent': choice(self.HEADERS_LIST), 'Accept-Language': 'en'}
         )
 
     def get_more_tweets(self):
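Note: the Accept-Language header is presumably added so Twitter serves the profile page in English, which keeps the join date string in the '%d %B %Y' form that User.extract_join_date (further down) passes to strptime.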

+ 4 - 1
twhatter/parser/__init__.py

@@ -1,6 +1,7 @@
 from .tweet import (TweetList, TweetBase,
                     tweet_factory,
                     TweetTextOnly, TweetLink, TweetReaction, TweetRetweet)
+from .user import user_factory
 
 __all__= [
     "TweetList",
@@ -9,5 +10,7 @@ __all__= [
     "TweetTextOnly",
     "TweetLink",
     "TweetReaction",
-    "TweetRetweet"
+    "TweetRetweet",
+
+    "user_factory"
 ]

+ 40 - 0
twhatter/parser/mixins.py

@@ -0,0 +1,40 @@
+from dataclasses import Field, InitVar
+from typing import Any
+
+from bs4 import BeautifulSoup
+
+
+class ExtractableMixin:
+    """Mixin to extract values from soup"""
+    def __post_init__(self, soup: BeautifulSoup):
+        self.soup = soup
+
+    @classmethod
+    def _extract_value(cls, soup: BeautifulSoup, data_field: Field) -> Any:
+        fn = getattr(cls, "extract_{}".format(data_field.name), None)
+        if not fn:
+            raise NotImplementedError(
+                "Extract function for field '{}' is not "
+                "implemented".format(data_field.name)
+            )
+
+        return fn(soup)
+
+    @staticmethod
+    def _extract_from_div(soup, div_class, data_kw):
+        kw = "data-{}".format(data_kw)
+        return(
+            soup.find('div', class_=div_class, attrs={kw: True})[kw]
+        )
+
+    @staticmethod
+    def _extract_from_span(soup, distinct_span, data_kw):
+        return (
+            soup.find('span', distinct_span)
+                .find('span', attrs={data_kw: True})
+            [data_kw]
+        )
+
+    @staticmethod
+    def extract_soup(soup):
+        return soup
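For context on how the mixin is meant to be consumed (the Example class below is illustrative only, not part of the commit): _extract_value looks up an extract_<field> callable named after each dataclass field, the soup InitVar is handed to __post_init__ so the raw markup stays attached to the instance, and a field without a matching extract_ method raises NotImplementedError.

    from dataclasses import dataclass, fields, InitVar
    from bs4 import BeautifulSoup

    from twhatter.parser.mixins import ExtractableMixin

    @dataclass
    class Example(ExtractableMixin):
        title: str
        #: The soup extracted from the raw HTML, consumed by __post_init__
        soup: InitVar[BeautifulSoup] = None

        @staticmethod
        def extract_title(soup):
            return soup.find('h1').text

    soup = BeautifulSoup("<h1>hello</h1>", "html.parser")
    # InitVar pseudo-fields are skipped by fields(), so only 'title' is extracted here
    kwargs = {f.name: Example._extract_value(soup, f) for f in fields(Example)}
    example = Example(soup=soup, **kwargs)  # Example(title='hello')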

+ 6 - 34
twhatter/parser/tweet.py

@@ -4,9 +4,11 @@ from bs4 import BeautifulSoup
 from dataclasses import dataclass, fields, InitVar, field
 from typing import List
 
+from .mixins import ExtractableMixin
+
 
 @dataclass
-class TweetBase:
+class TweetBase(ExtractableMixin):
     #: Tweet ID
     id: int
     #: Handle of the tweet's original author
@@ -46,9 +48,6 @@ class TweetBase:
     #: The soup extracted from the raw HTML
     soup: InitVar[BeautifulSoup] = None
 
-    def __post_init__(self, soup: BeautifulSoup):
-        self.soup = soup
-
     def __repr__(self):
         return ("<{0} "
                 "(id={1.id}, "
@@ -61,25 +60,10 @@ class TweetBase:
     def condition(kwargs: dict) -> bool:
         raise NotImplementedError()
 
-    @staticmethod
-    def _extract_from_span(soup, distinct_span, data_kw):
-        return (
-            soup.find('span', distinct_span)
-                .find('span', attrs={data_kw: True})
-            [data_kw]
-        )
-
     @classmethod
     def _extract_from_div_tweet(cls, soup, data_kw):
         return cls._extract_from_div(soup, 'tweet', data_kw)
 
-    @staticmethod
-    def _extract_from_div(soup, div_class, data_kw):
-        kw = "data-{}".format(data_kw)
-        return(
-            soup.find('div', class_=div_class, attrs={kw: True})[kw]
-        )
-
     @staticmethod
     def extract_id(soup):
         return int(soup['data-item-id'])
@@ -188,10 +172,6 @@ class TweetBase:
     def extract_text(soup):
         return soup.find('p', 'tweet-text').text
 
-    @staticmethod
-    def extract_soup(soup):
-        return soup
-
 
 class TweetTextOnly(TweetBase):
     """An original tweet with only plain text"""
@@ -223,17 +203,9 @@ def tweet_factory(soup: BeautifulSoup) -> TweetBase:
     :param soup: the soup extracted from the raw html for that tweet
     :return: a well-formatted Tweet
     """
-    def _extract_value(data_field):
-        fn = getattr(TweetBase, "extract_{}".format(data_field.name), None)
-        if not fn:
-            raise NotImplementedError(
-                "Extract function for field '{}' is not "
-                "implemented".format(data_field.name)
-            )
-
-        return fn(soup)
-
-    kwargs = {f.name: _extract_value(f) for f in fields(TweetBase)}
+    kwargs = {
+        f.name: TweetBase._extract_value(soup, f) for f in fields(TweetBase)
+    }
 
     for kls in TweetBase.__subclasses__():
         try:

+ 75 - 0
twhatter/parser/user.py

@@ -0,0 +1,75 @@
+from datetime import datetime
+
+from bs4 import BeautifulSoup
+from dataclasses import dataclass, fields, InitVar, field
+from typing import List
+
+from .mixins import ExtractableMixin
+
+
+@dataclass
+class User(ExtractableMixin):
+    id: int
+    screen_name: str
+    join_date: datetime
+    tweets_nb: int
+    following_nb: int
+    followers_nb: int
+    likes_nb: int
+
+    #: The soup extracted from the raw HTML
+    soup: InitVar[BeautifulSoup] = None
+
+    @staticmethod
+    def _extract_from_li(soup, distinct_span, kw):
+        data_kw = "data-{}".format(kw)
+        return (
+            soup.find('li', distinct_span)
+                .find('span', attrs={data_kw: True})
+            [data_kw]
+        )
+
+    @classmethod
+    def extract_id(cls, soup):
+        return int(cls._extract_from_div(soup, 'ProfileNav', 'user-id'))
+
+    @classmethod
+    def extract_screen_name(cls, soup):
+        return soup.find('a', 'ProfileHeaderCard-nameLink').text
+
+    @classmethod
+    def extract_join_date(cls, soup):
+        kw = 'title'
+        datetime_str = soup.find(
+            'span',
+            class_='ProfileHeaderCard-joinDateText',
+            attrs={kw: True}
+        )[kw]
+
+        # The date is in a weird format (e.g. "7:27 AM - 8 May 2011") and we
+        # don't really care for the exact hour so we only keep the date
+        day_str = datetime_str.split(' - ')[1]
+        return datetime.strptime(day_str, '%d %B %Y')
+
+    @classmethod
+    def extract_tweets_nb(cls, soup):
+        return int(cls._extract_from_li(soup, 'ProfileNav-item--tweets', 'count'))
+
+    @classmethod
+    def extract_following_nb(cls, soup):
+        return int(cls._extract_from_li(soup, 'ProfileNav-item--following', 'count'))
+
+    @classmethod
+    def extract_followers_nb(cls, soup):
+        return int(cls._extract_from_li(soup, 'ProfileNav-item--followers', 'count'))
+
+    @classmethod
+    def extract_likes_nb(cls, soup):
+        return int(cls._extract_from_li(soup, 'ProfileNav-item--favorites', 'count'))
+
+
+def user_factory(soup: BeautifulSoup) -> User:
+    kwargs = {
+        f.name: User._extract_value(soup, f) for f in fields(User)
+    }
+    return User(**kwargs)
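
Putting the pieces together, the new parser might be exercised along these lines (a sketch only: it mirrors the request ApiUser builds above instead of calling it, since the method's name isn't visible in that hunk, and it assumes Twitter still serves the legacy profile markup with the ProfileNav/ProfileHeaderCard classes):

    import requests
    from bs4 import BeautifulSoup

    from twhatter.parser import user_factory

    # Same request shape as ApiUser above; Accept-Language keeps the page, and
    # therefore the join date string, in English.
    response = requests.get(
        "https://twitter.com/Marlene_beadles",
        headers={'User-Agent': 'Mozilla/5.0', 'Accept-Language': 'en'}
    )
    soup = BeautifulSoup(response.text, "html.parser")

    user = user_factory(soup)  # every User field filled via its extract_<field> method
    print(user.screen_name, user.join_date, user.followers_nb)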