Some documentation and renaming

theenglishway committed 7 years ago
parent commit 63e4e4b600

+ 5 - 0
HISTORY.rst

@@ -2,6 +2,11 @@
 History
 =======
 
+0.2.0 (2019-01-23)
+------------------
+
+* First proper release.
+
 0.1.0 (2019-01-09)
 ------------------
 

+ 20 - 6
README.rst

@@ -15,8 +15,8 @@ anonymous browsing session.
 
 This is mostly an attempt for me to produce some clean, functional and
 maintainable Python code. I have especially focused on a clean separation
-between data retrieval and output, which should allow to export data in any
-format.
+between data retrieval, Twitter page exploration, and output, which makes it
+easy to define and combine various crawling strategies and data formats.
 
 And why that terrible name? Simple: "WHAT's going on TWITTER?" => TWHATTER!
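
To give a rough idea of what this combination looks like in code, mirroring the `profile` and `timeline` commands of `cli.py` further down in this diff (the import paths and the location of the `Print` output are assumed here, not shown in the diff) ::

    # Sketch only -- import paths are assumed from the file layout in this diff
    from twhatter.exploration.node.timeline import NodeTimeline
    from twhatter.exploration.strategy.dumb import StrategyDumb
    from twhatter.parser import ParserTweet, ParserUser
    from twhatter.output import Print  # assumed location of the Print output used in cli.py

    node = NodeTimeline("realDonaldTrump", limit=40)        # retrieval + exploration
    strategy = StrategyDumb(node, ParserTweet, ParserUser)  # crawling strategy + parsers
    strategy(Print())                                       # output / data format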
 
@@ -34,8 +34,11 @@ Anonymous client
 Data output
 ***********
 
-All scraped information can either be displayed on the terminal or stored into
-a local database.
+All scraped information can be:
+
+* displayed on the terminal,
+* stored into a JSON / YAML file, or
+* stored into a local database.
 
 Installation
 ------------
@@ -44,7 +47,7 @@ Installation requires Python >= 3.6. ::
 
     $ pip install --user git+https://code.theenglishway.eu/theenglishway-corp/twhatter
 
-You then have to ensure that `~/.local/bin` in your `$PATH` or call
+You then have to ensure that `~/.local/bin` is in your `$PATH` or call
 `~/.local/bin/twhatter` instead of `twhatter` in the following examples.
 
 Usage
@@ -52,12 +55,13 @@ Usage
 
 Display some user's tweets ::
 
-    $ twhatter timeline realDonaldTrump --limit 10
+    $ twhatter timeline realDonaldTrump --limit 40
     <TweetTextOnly (id=1083404900862545920, date=2019-01-10 16:47:11, likes=32033, likes=11087, likes=6935)>
     <TweetTextOnly (id=1083358775925460992, date=2019-01-10 13:43:54, likes=96565, likes=22596, likes=26802)>
     <TweetTextOnly (id=1083358611315789826, date=2019-01-10 13:43:15, likes=52849, likes=9344, likes=9571)>
     <TweetTextOnly (id=1083358150214979585, date=2019-01-10 13:41:25, likes=48808, likes=11096, likes=11499)>
     <TweetTextOnly (id=1083356326833602561, date=2019-01-10 13:34:10, likes=50695, likes=11743, likes=11045)>
+    ...
     <TweetTextOnly (id=1083353895030702080, date=2019-01-10 13:24:30, likes=85184, likes=19686, likes=27751)>
     <TweetRetweet (id=1083121283645272064, date=2019-01-09 22:00:12, likes=42640, likes=13189, likes=10242)>
     <TweetRetweet (id=1082774275390693376, date=2019-01-08 23:01:18, likes=52776, likes=14459, likes=2403)>
@@ -69,6 +73,11 @@ Display their profile information ::
     $ twhatter profile realDonaldTrump
     User(id=25073877, fullname='Donald J. Trump', join_date=datetime.datetime(2009, 3, 18, 0, 0), tweets_nb=40183, following_nb=45, followers_nb=57144827, likes_nb=7)
 
+Put them into a JSON/YAML file ::
+
+    $ twhatter json timeline realDonaldTrump
+    $ twhatter yaml profile realDonaldTrump
+
 Put them into a local database (by default in /tmp/db.sqlite) ::
 
     $ twhatter db timeline realDonaldTrump
@@ -86,6 +95,11 @@ Open a session on the local database and make queries with SQLAlchemy ::
      <Tweet (id=1026482814164844544),
      <Tweet (id=1027797734613504001)]
 
+In all cases, help is available ::
+
+    $ twhatter --help
+
+
 Tests
 -----
 

+ 1 - 1
tests/conftest.py

@@ -148,7 +148,7 @@ def tweet_collection():
 def raw_html_user_initial_page_factory():
     def _raw_html_user_initial_page(user):
         n = NodeTimeline(user)
-        response = n.get_user_timeline(user)
+        response = n._get_base_page(user)
         return BeautifulSoup(response.text, "lxml")
     return _raw_html_user_initial_page
 

+ 6 - 1
twhatter/cli.py

@@ -18,6 +18,7 @@ from twhatter.parser import ParserTweet, ParserUser
               default='info', show_default=True)
 @click.pass_context
 def main(ctx, verbosity):
+    """Output various information from Twitter"""
     log_setup(verbosity)
     ctx.ensure_object(dict)
     ctx.obj['output'] = Print()
@@ -27,6 +28,7 @@ def main(ctx, verbosity):
 @click.option('-d', '--db_url', type=str, default="sqlite:////tmp/db.sqlite3", show_default=True)
 @click.pass_context
 def db(ctx, db_url):
+    """Output information into a database"""
     ctx.obj['output'] = Database(db_url)
 
 
@@ -34,6 +36,7 @@ def db(ctx, db_url):
 @click.option('-f', '--json_file', type=str, default="/tmp/output.json", show_default=True)
 @click.pass_context
 def json(ctx, json_file):
+    """Output information into a JSON file"""
     ctx.obj['output'] = Json(json_file)
 
 
@@ -41,6 +44,7 @@ def json(ctx, json_file):
 @click.option('-f', '--yaml_file', type=str, default="/tmp/output.yaml", show_default=True)
 @click.pass_context
 def yaml(ctx, yaml_file):
+    """Output information into a YAML file"""
     ctx.obj['output'] = Yaml(yaml_file)
 
 
@@ -58,7 +62,7 @@ def timeline(ctx, limit, user):
 @click.argument('user')
 @click.pass_context
 def profile(ctx, user):
-    """Get basic info about some user"""
+    """Get a user's profile information"""
     start_node = NodeTimeline(user, limit=1)
     strategy = StrategyDumb(start_node, ParserUser)
     strategy(ctx.obj['output'])
@@ -67,6 +71,7 @@ def profile(ctx, user):
 @db.command()
 @click.pass_context
 def shell(ctx):
+    """Launch an IPython session to interact with the database"""
     session = ctx.obj['output'].start()
     user_ns = {
         'db': ctx.obj['output'],

+ 3 - 2
twhatter/exploration/node/base.py

@@ -4,8 +4,9 @@ logger = logging.getLogger(__name__)
 
 class NodeBase:
     """Base class for nodes, which are all the pages that Twitter allows us
-    to visit. They can be iterated on, and will yield 'soup' within the limits
-    defined at initialization"""
+    to visit with a simple HTTP client.
+    They behave as generators, and yield `PageElement` data as processed
+    by the `BeautifulSoup` library."""
     def __init__(self):
         logger.debug("Initializing {}".format(self.__class__.__qualname__))
 

+ 11 - 8
twhatter/exploration/node/timeline.py

@@ -12,6 +12,9 @@ logger = logging.getLogger(__name__)
 
 
 class NodeTimeline(NodeBase):
+    """Implementation of the "timeline" node, which is the page accessed by
+    https://twitter.com/the_user_name, that can be scrolled until the beginning
+    of times."""
     user_agent = generate_user_agent(os='linux')
 
     def __init__(self, user, limit=100):
@@ -28,8 +31,8 @@ class NodeTimeline(NodeBase):
         logger.info("{} tweets retrieved so far".format(self.nb_tweets))
 
     @classmethod
-    def get_user_timeline(cls, user_handle):
-        logger.info("Loading initial timeline for {}".format(user_handle))
+    def _get_base_page(cls, user_handle):
+        logger.info("Loading base page for {}'s timeline".format(user_handle))
         url = "https://twitter.com/{}".format(user_handle)
         return requests.get(
             url,
@@ -39,8 +42,8 @@ class NodeTimeline(NodeBase):
             }
         )
 
-    def get_more_tweets(self):
-        logger.info("Loading more tweets from {}".format(self.user))
+    def _scroll(self):
+        logger.info("Scrolling in {}'s timeline".format(self.user))
         return requests.get(
             "https://twitter.com/i/profiles/show/{}/timeline/tweets".format(self.user),
             params= dict(
@@ -54,14 +57,14 @@ class NodeTimeline(NodeBase):
 
     def __iter__(self):
         super().__iter__()
-        tweets = self.get_user_timeline(self.user)
-        soup = BeautifulSoup(tweets.text, "lxml")
+        base = self._get_base_page(self.user)
+        soup = BeautifulSoup(base.text, "lxml")
         self._update_state(soup)
         yield soup
 
         while self.nb_tweets < self.limit:
-            more_tweets = self.get_more_tweets()
-            html = json.loads(more_tweets.content)
+            more = self._scroll()
+            html = json.loads(more.content)
 
             soup = BeautifulSoup(html['items_html'], "lxml")
             if not soup.text:
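
With the renamed helpers, iterating the node is what drives them: `_get_base_page()` is called once for the first soup and `_scroll()` for every following one, until `limit` tweets have been seen. Roughly (import path assumed from the layout) ::

    from twhatter.exploration.node.timeline import NodeTimeline  # path assumed

    node = NodeTimeline("realDonaldTrump", limit=40)
    for soup in node:
        # first soup: the base page; later soups: the 'items_html' chunks
        print(soup.text[:80])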

+ 6 - 8
twhatter/exploration/strategy/base.py

@@ -9,18 +9,16 @@ class StrategyBase:
     parser_classes = []
 
     def __init__(self, starting_node: 'NodeBase', *parser_classes: 'ParserBase') -> None:
-        logger.debug(
-            "Initializing {} with starting_node={} and parser_classes={}".format(
-                self.__class__.__qualname__,
-                starting_node,
-                parser_classes
-            )
-        )
+        """
+        :param starting_node: the node from which exploration starts
+        :param parser_classes: parsers that should be applied on each iteration
+            of the node
+        """
         self.starting_node = starting_node
         self.parser_classes = parser_classes
 
     def __call__(self, output) -> None:
-        logger.debug("Applying {}".format(self.__class__.__qualname__))
+        logger.debug("Applying {}".format(self))
 
     def __repr__(self):
         return "<{} (starting_node={}, parsers={})>".format(

+ 3 - 1
twhatter/exploration/strategy/dumb.py

@@ -8,7 +8,8 @@ logger = logging.getLogger(__name__)
 
 
 class StrategyDumb(StrategyBase):
-    """This strategy only explores the initial node"""
+    """This strategy only explores the initial node and scrolls through it
+    until exhaustion"""
     def __call__(self, output):
         super().__call__(output)
         output.start()
@@ -20,6 +21,7 @@ class StrategyDumb(StrategyBase):
                 logger.debug("Parsing new soup with {}".format(parser))
                 for o in parser(s):
                     objs.append(o)
+
         tweets = [t for t in objs if isinstance(t, TweetBase)]
         output.output_tweets(tweets)
         users = [u for u in objs if isinstance(u, User)]
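
The base/dumb split leaves room for other strategies. As a purely hypothetical sketch (class name invented here, `TweetBase` location assumed; only the `StrategyBase` attributes and the `__call__(output)` protocol are taken from this diff), a strategy could parse just the first soup ::

    from twhatter.exploration.strategy.base import StrategyBase  # path assumed
    from twhatter.parser.tweet import TweetBase                  # TweetBase location assumed


    class StrategyFirstPage(StrategyBase):
        """Hypothetical strategy that parses only the first soup yielded by the node."""
        def __call__(self, output):
            super().__call__(output)
            output.start()
            soup = next(iter(self.starting_node))
            objs = [o for parser in self.parser_classes for o in parser(soup)]
            output.output_tweets([o for o in objs if isinstance(o, TweetBase)])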

+ 4 - 3
twhatter/parser/base.py

@@ -2,10 +2,11 @@ from typing import Any
 
 
 class ParserBase:
-    """Base class for a parser, an iterator that yield all elements of a certain
-    type within a given page"""
+    """Base class for a parser
+    A parser behaves like a generators, and yield all elements of a certain
+    type within the original page"""
     def __init__(self, soup: 'PageElement') -> None:
-        pass
+        raise NotImplementedError()
 
     def __iter__(self) -> Any:
         raise NotImplementedError()
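
Since `ParserBase.__init__` now raises `NotImplementedError`, the concrete parsers below keep the soup themselves and no longer call `super().__init__()`. A minimal hypothetical parser following the same contract (class name invented here, import path assumed from the layout) ::

    from bs4 import BeautifulSoup

    from twhatter.parser.base import ParserBase  # path assumed from the layout


    class ParserLink(ParserBase):
        """Hypothetical parser that yields every link found in the page."""
        def __init__(self, soup: BeautifulSoup):
            self.soup = soup  # no super().__init__() call, as in the parsers below

        def __iter__(self):
            for a in self.soup.find_all("a", href=True):
                yield a["href"]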

+ 0 - 1
twhatter/parser/media.py

@@ -64,7 +64,6 @@ def media_factory(soup: BeautifulSoup) -> Optional[MediaBase]:
 
 class ParserMedia(ParserBase):
     def __init__(self, soup):
-        super().__init__(soup)
         self.soup = soup
 
     def __iter__(self):

+ 0 - 2
twhatter/parser/tweet.py

@@ -257,8 +257,6 @@ def tweet_factory(soup: BeautifulSoup) -> TweetBase:
 
 class ParserTweet(ParserBase):
     def __init__(self, soup):
-        super().__init__(soup)
-
         # Here 'soup' can be either the full html page as loaded initially,
         # or raw HTML incoming from the XHR requests sent when browsing deeper
         # in the page, so the strategy is adapted to work in both cases.

+ 0 - 1
twhatter/parser/user.py

@@ -91,7 +91,6 @@ class User(ExtractableMixin):
 
 class ParserUser(ParserBase):
     def __init__(self, soup):
-        super().__init__(soup)
         self.soup = soup
 
     def __iter__(self):