tweet.py 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
  1. from datetime import datetime
  2. from bs4 import BeautifulSoup
  3. from dataclasses import dataclass, fields, InitVar, field
  4. @dataclass
  5. class Tweet:
  6. id: int
  7. timestamp: datetime
  8. user: str
  9. replies: int
  10. retweets: int
  11. likes: int
  12. text: str = field(repr=False)
  13. soup: InitVar[BeautifulSoup] = None
  14. def __post_init__(self, soup):
  15. self.soup = soup
  16. @staticmethod
  17. def _extract_data(soup, distinct_span, data_kw):
  18. return (
  19. soup.find('span', distinct_span)
  20. .find('span', attrs={data_kw: True})
  21. [data_kw]
  22. )
  23. @staticmethod
  24. def extract_id(soup):
  25. return int(soup['data-item-id'])
  26. @staticmethod
  27. def extract_user(soup):
  28. return soup.find('span', 'username').text
  29. @staticmethod
  30. def extract_timestamp(soup):
  31. return datetime.utcfromtimestamp(
  32. int(soup.find('span', '_timestamp')['data-time'])
  33. )
  34. @staticmethod
  35. def extract_fullname(soup):
  36. return soup.find('strong', 'fullname').text
  37. @classmethod
  38. def extract_retweets(cls, soup):
  39. return cls._extract_data(
  40. soup,
  41. 'ProfileTweet-action--retweet',
  42. 'data-tweet-stat-count'
  43. )
  44. @classmethod
  45. def extract_replies(cls, soup):
  46. return cls._extract_data(
  47. soup,
  48. 'ProfileTweet-action--reply',
  49. 'data-tweet-stat-count'
  50. )
  51. @classmethod
  52. def extract_likes(cls, soup):
  53. return cls._extract_data(
  54. soup,
  55. 'ProfileTweet-action--favorite',
  56. 'data-tweet-stat-count'
  57. )
  58. @staticmethod
  59. def extract_text(soup):
  60. return soup.find('p', 'tweet-text').text
  61. @staticmethod
  62. def extract_quoted_tweet(soup):
  63. return int(soup.find(
  64. 'span', 'QuoteTweet-innerContainer').find(
  65. 'span', 'ProfileTweet-actionCount')['data-tweet-stat-count']),
  66. @staticmethod
  67. def extract_soup(soup):
  68. return soup
  69. @classmethod
  70. def extract(cls, soup):
  71. def _extract_value(field):
  72. fn = getattr(cls, "extract_{}".format(field.name), None)
  73. if not fn:
  74. raise NotImplementedError(
  75. "Extract function for field '{}' is not "
  76. "implemented".format(field.name)
  77. )
  78. return fn(soup)
  79. kwargs = {f.name: _extract_value(f) for f in fields(cls)}
  80. return cls(soup=soup, **kwargs)
  81. class TweetList:
  82. def __init__(self, soup):
  83. self.raw_tweets = soup.find_all('li', 'stream-item')
  84. def __iter__(self):
  85. for tweet in self.raw_tweets:
  86. yield Tweet.extract(tweet)
  87. def __len__(self):
  88. return len(self.raw_tweets)