Explorar el Código

Update tweet parser to yield tweets at the same depth in the DOM

theenglishway (time) hace 7 años
padre
commit
55446a74a8

+ 3 - 1
twhatter/exploration/node/timeline.py

@@ -25,6 +25,7 @@ class NodeTimeline(NodeBase):
         tweets = ParserTweet(soup)
         self.nb_tweets += len(tweets)
         *_, self.earliest_tweet_id = (t.id for t in tweets)
+        logger.info("{} tweets retrieved so far".format(self.nb_tweets))
 
     @classmethod
     def get_user_timeline(cls, user_handle):
@@ -58,12 +59,13 @@ class NodeTimeline(NodeBase):
         self._update_state(soup)
         yield soup
 
-        while True and self.nb_tweets < self.limit:
+        while self.nb_tweets < self.limit:
             more_tweets = self.get_more_tweets()
             html = json.loads(more_tweets.content)
 
             soup = BeautifulSoup(html['items_html'], "lxml")
             if not soup.text:
+                logger.info("Latest request provided no explorable content")
                 break
 
             self._update_state(soup)

+ 2 - 1
twhatter/exploration/strategy/dumb.py

@@ -15,8 +15,9 @@ class StrategyDumb(StrategyBase):
 
         objs = []
         for s in self.starting_node:
+            logger.debug("Got new soup from {}".format(self.starting_node))
             for parser in self.parser_classes:
-                logger.debug("Parsing new data with {}".format(parser))
+                logger.debug("Parsing new soup with {}".format(parser))
                 for o in parser(s):
                     objs.append(o)
         tweets = [t for t in objs if isinstance(t, TweetBase)]

+ 25 - 9
twhatter/parser/tweet.py

@@ -261,16 +261,32 @@ def tweet_factory(soup: BeautifulSoup) -> TweetBase:
 class ParserTweet(ParserBase):
     def __init__(self, soup):
         super().__init__(soup)
-        self.raw_tweets = soup.find_all('li', 'stream-item')
+
+        # Here 'soup' can be either the full html page as loaded initially,
+        # or raw HTML incoming from the XHR requests sent when browsing deeper
+        # in the page, so the strategy is adapted to work in both cases.
+        # We locate the first tweet ...
+        # TODO solve the case when the timeline has a pinned tweet
+        self.first = soup.find(
+            'li',
+            class_='stream-item',
+            attrs={'data-item-type': 'tweet'}
+        )
 
     def __iter__(self):
-        for tweet in self.raw_tweets:
-            # Don't know what this u-dir stuff is about but if it's in there,
-            # it's not a tweet !
-            if not tweet.find_all('p', class_="u-dir"):
-                t = tweet_factory(tweet)
-                logger.debug("Parsed tweet {}".format(t))
-                yield t
+        current = self.first
+        while True:
+            t = tweet_factory(current)
+            logger.debug("Parsed tweet {}".format(t))
+            yield t
+
+            # ... and then we iterate over all of its siblings.
+            # This avoids descending into the hierarchy, so we only yield
+            # tweets at the same depth and not the ones embedded within other
+            # tweets (e.g. retweets or reaction tweets)
+            current = current.find_next_sibling('li')
+            if not current:
+                break
 
     def __len__(self):
-        return len(self.raw_tweets)
+        return len(self.first.find_next_siblings('li')) + 1