7 years ago · 55446a74a8
--- a/twhatter/exploration/node/timeline.py
+++ b/twhatter/exploration/node/timeline.py
@@ -25,6 +25,7 @@ class NodeTimeline(NodeBase):
 
				         tweets = ParserTweet(soup)
			
 
				         self.nb_tweets += len(tweets)
			
 
				         *_, self.earliest_tweet_id = (t.id for t in tweets)
			
 
				+        logger.info("{} tweets retrieved so far".format(self.nb_tweets))
			
 
				 
			
 
				     @classmethod
			
 
				     def get_user_timeline(cls, user_handle):
			
@@ -58,12 +59,13 @@ class NodeTimeline(NodeBase):
 
				         self._update_state(soup)
			
 
				         yield soup
			
 
				 
			
 
				-        while True and self.nb_tweets < self.limit:
			
 
				+        while self.nb_tweets < self.limit:
			
 
				             more_tweets = self.get_more_tweets()
			
 
				             html = json.loads(more_tweets.content)
			
 
				 
			
 
				             soup = BeautifulSoup(html['items_html'], "lxml")
			
 
				             if not soup.text:
			
 
				+                logger.info("Latest request provided no explorable content")
			
 
				                 break
			
 
				 
			
 
				             self._update_state(soup)
			
--- a/twhatter/exploration/strategy/dumb.py
+++ b/twhatter/exploration/strategy/dumb.py
@@ -15,8 +15,9 @@ class StrategyDumb(StrategyBase):
 
				 
			
 
				         objs = []
			
 
				         for s in self.starting_node:
			
 
				+            logger.debug("Got new soup from {}".format(self.starting_node))
			
 
				             for parser in self.parser_classes:
			
 
				-                logger.debug("Parsing new data with {}".format(parser))
			
 
				+                logger.debug("Parsing new soup with {}".format(parser))
			
 
				                 for o in parser(s):
			
 
				                     objs.append(o)
			
 
				         tweets = [t for t in objs if isinstance(t, TweetBase)]
			
--- a/twhatter/parser/tweet.py
+++ b/twhatter/parser/tweet.py
@@ -261,16 +261,32 @@ def tweet_factory(soup: BeautifulSoup) -> TweetBase:
 
				 class ParserTweet(ParserBase):
			
 
				     def __init__(self, soup):
			
 
				         super().__init__(soup)
			
 
				-        self.raw_tweets = soup.find_all('li', 'stream-item')
			
 
				+
			
 
				+        # Here 'soup' can be either the full html page as loaded initially,
			
 
				+        # or raw HTML incoming from the XHR requests sent when browsing deeper
			
 
				+        # in the page, so the strategy is adapted to work in both cases.
			
 
				+        # We locate the first tweet ...
			
 
				+        # TODO solve the case when the timeline has a pinned tweet
			
 
				+        self.first = soup.find(
			
 
				+            'li',
			
 
				+            class_='stream-item',
			
 
				+            attrs={'data-item-type': 'tweet'}
			
 
				+        )
			
 
				 
			
 
				     def __iter__(self):
			
 
				-        for tweet in self.raw_tweets:
			
 
				-            # Don't know what this u-dir stuff is about but if it's in there,
			
 
				-            # it's not a tweet !
			
 
				-            if not tweet.find_all('p', class_="u-dir"):
			
 
				-                t = tweet_factory(tweet)
			
 
				-                logger.debug("Parsed tweet {}".format(t))
			
 
				-                yield t
			
 
				+        current = self.first
			
 
				+        while True:
			
 
				+            t = tweet_factory(current)
			
 
				+            logger.debug("Parsed tweet {}".format(t))
			
 
				+            yield t
			
 
				+
			
 
				+            # ... and then we iterate on all the siblings
			
 
				+            # This allows to not fall into the hierarchy, and yield tweets that
			
 
				+            # are not only embedded within other ones (e.g. retweets or
			
 
				+            # reaction tweets)
			
 
				+            current = current.find_next_sibling('li')
			
 
				+            if not current:
			
 
				+                break
			
 
				 
			
 
				     def __len__(self):
			
 
				-        return len(self.raw_tweets)
			
 
				+        return len(self.first.find_next_siblings('li')) + 1