# tweet.py (2.1 KB)

from dataclasses import dataclass, field, fields
from datetime import datetime, timezone

from bs4 import BeautifulSoup
  4. @dataclass
  5. class Tweet:
  6. #timestamp: datetime
  7. user: str
  8. #fullname: fullname
  9. id: int
  10. #url: url
  11. timestamp: datetime
  12. #text: str
  13. #replies: int
  14. retweets: int
  15. #quoted_tweet: int
  16. likes: int
  17. #html: str
  18. #soup: Any
  19. soup: BeautifulSoup = field(repr=False)
  20. @staticmethod
  21. def extract_id(soup):
  22. return int(soup['data-item-id'])
  23. @staticmethod
  24. def extract_timestamp(soup):
  25. return datetime.utcfromtimestamp(
  26. int(soup.find('span', '_timestamp')['data-time'])
  27. )
  28. @staticmethod
  29. def extract_user(soup):
  30. return soup.find('span', 'username').text or ""
  31. @staticmethod
  32. def extract_fullname(soup):
  33. return soup.find('strong', 'fullname').text or ""
  34. @staticmethod
  35. def extract_retweets(soup):
  36. return int(soup.find(
  37. 'span', 'ProfileTweet-action--retweet u-hiddenVisually')
  38. .find(
  39. 'span', 'ProfileTweet-actionCount'
  40. )['data-tweet-stat-count']
  41. )
  42. @staticmethod
  43. def extract_quoted_tweet(soup):
  44. return int(soup.find(
  45. 'span', 'QuoteTweet-innerContainer').find(
  46. 'span', 'ProfileTweet-actionCount')['data-tweet-stat-count']),
  47. @staticmethod
  48. def extract_soup(soup):
  49. return soup
  50. @classmethod
  51. def extract(cls, soup):
  52. def _extract_value(field):
  53. fn = getattr(cls, "extract_{}".format(field.name), None)
  54. if not fn:
  55. raise NotImplementedError(
  56. "Extract function for field '{}' is not "
  57. "implemented".format(field.name)
  58. )
  59. return fn(soup)
  60. kwargs = {f.name: _extract_value(f) for f in fields(cls)}
  61. return cls(**kwargs)
  62. class TweetList:
  63. def __init__(self, soup):
  64. self.raw_tweets = soup.find_all('li', 'stream-item')
  65. def __iter__(self):
  66. for tweet in self.raw_tweets:
  67. yield Tweet.extract(tweet)