7 лет назад · 63e4e4b600
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -2,6 +2,11 @@
 
																 History
															
 
																 =======
															
 
																+0.2.0 (2019-01-23)
															
 
																+------------------
															
 
																+
															
 
																+* First proper release.
															
 
																+
															
 
																 0.1.0 (2019-01-09)
															
 
																 ------------------
															
--- a/README.rst
+++ b/README.rst
@@ -15,8 +15,8 @@ anonymous browsing session.
 
																 This is mostly an attempt for me to produce some clean, functional and
															
 
																 maintainable Python code. I have especially focused on a clean separation
															
 
																-between data retrieval and output, which should allow to export data in any
															
 
																-format.
															
 
																+between data retrieval, Twitter page exploration, and output, which allows one
															
 
																+to easily define and combine various crawling strategies and data formats.
															
 
																 And why that terrible name ? Simple, "WHAT's going on TWITTER ?" => TWHATTER !
															
@@ -34,8 +34,11 @@ Anonymous client
 
																 Data output
															
 
																 ***********
															
 
																-All scraped information can either be displayed on the terminal or stored into
															
 
																-a local database.
															
 
																+All scraped information can be:
															
 
																+
															
 
																+* displayed on the terminal,
															
 
																+* stored into a JSON / YAML file,
															
 
																+* stored into a local database.
															
 
																 Installation
															
 
																 ------------
															
@@ -44,7 +47,7 @@ Installation requires Python >= 3.6. ::
 
																     $ pip install --user git+https://code.theenglishway.eu/theenglishway-corp/twhatter
															
 
																-You then have to ensure that `~/.local/bin` in your `$PATH` or call
															
 
																+You then have to ensure that `~/.local/bin` is in your `$PATH` or call
															
 
																 `~/.local/bin/twhatter` instead of `twhatter` in the following examples
															
 
																 Usage
															
@@ -52,12 +55,13 @@ Usage
 
																 Display some user's tweets ::
															
 
																-    $ twhatter timeline realDonaldTrump --limit 10
															
 
																+    $ twhatter timeline realDonaldTrump --limit 40
															
 
																     <TweetTextOnly (id=1083404900862545920, date=2019-01-10 16:47:11, likes=32033, likes=11087, likes=6935)>
															
 
																     <TweetTextOnly (id=1083358775925460992, date=2019-01-10 13:43:54, likes=96565, likes=22596, likes=26802)>
															
 
																     <TweetTextOnly (id=1083358611315789826, date=2019-01-10 13:43:15, likes=52849, likes=9344, likes=9571)>
															
 
																     <TweetTextOnly (id=1083358150214979585, date=2019-01-10 13:41:25, likes=48808, likes=11096, likes=11499)>
															
 
																     <TweetTextOnly (id=1083356326833602561, date=2019-01-10 13:34:10, likes=50695, likes=11743, likes=11045)>
															
 
																+    ...
															
 
																     <TweetTextOnly (id=1083353895030702080, date=2019-01-10 13:24:30, likes=85184, likes=19686, likes=27751)>
															
 
																     <TweetRetweet (id=1083121283645272064, date=2019-01-09 22:00:12, likes=42640, likes=13189, likes=10242)>
															
 
																     <TweetRetweet (id=1082774275390693376, date=2019-01-08 23:01:18, likes=52776, likes=14459, likes=2403)>
															
@@ -69,6 +73,11 @@ Display their profile information ::
 
																     $ twhatter profile realDonaldTrump
															
 
																     User(id=25073877, fullname='Donald J. Trump', join_date=datetime.datetime(2009, 3, 18, 0, 0), tweets_nb=40183, following_nb=45, followers_nb=57144827, likes_nb=7)
															
 
																+Put them into a JSON/YAML file ::
															
 
																+
															
 
																+    $ twhatter json timeline realDonaldTrump
															
 
																+    $ twhatter yaml profile realDonaldTrump
															
 
																+
															
 
																 Put them into a local database (by default in /tmp/db.sqlite3) ::
															
 
																     $ twhatter db timeline realDonaldTrump
															
@@ -86,6 +95,11 @@ Open a session on the local database and make queries with SQLAlchemy ::
 
																      <Tweet (id=1026482814164844544),
															
 
																      <Tweet (id=1027797734613504001)]
															
 
																+In all cases, help is available with ::
															
 
																+
															
 
																+    $ twhatter --help
															
 
																+
															
 
																+
															
 
																 Tests
															
 
																 -----
															
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -148,7 +148,7 @@ def tweet_collection():
 
																 def raw_html_user_initial_page_factory():
															
 
																     def _raw_html_user_initial_page(user):
															
 
																         n = NodeTimeline(user)
															
 
																-        response = n.get_user_timeline(user)
															
 
																+        response = n._get_base_page(user)
															
 
																         return BeautifulSoup(response.text, "lxml")
															
 
																     return _raw_html_user_initial_page
															
--- a/twhatter/cli.py
+++ b/twhatter/cli.py
@@ -18,6 +18,7 @@ from twhatter.parser import ParserTweet, ParserUser
 
																               default='info', show_default=True)
															
 
																 @click.pass_context
															
 
																 def main(ctx, verbosity):
															
 
																+    """Output various information from Twitter"""
															
 
																     log_setup(verbosity)
															
 
																     ctx.ensure_object(dict)
															
 
																     ctx.obj['output'] = Print()
															
@@ -27,6 +28,7 @@ def main(ctx, verbosity):
 
																 @click.option('-d', '--db_url', type=str, default="sqlite:////tmp/db.sqlite3", show_default=True)
															
 
																 @click.pass_context
															
 
																 def db(ctx, db_url):
															
 
																+    """Output information into a database"""
															
 
																     ctx.obj['output'] = Database(db_url)
															
@@ -34,6 +36,7 @@ def db(ctx, db_url):
 
																 @click.option('-f', '--json_file', type=str, default="/tmp/output.json", show_default=True)
															
 
																 @click.pass_context
															
 
																 def json(ctx, json_file):
															
 
																+    """Output information into a JSON file"""
															
 
																     ctx.obj['output'] = Json(json_file)
															
@@ -41,6 +44,7 @@ def json(ctx, json_file):
 
																 @click.option('-f', '--yaml_file', type=str, default="/tmp/output.yaml", show_default=True)
															
 
																 @click.pass_context
															
 
																 def yaml(ctx, yaml_file):
															
 
																+    """Output information into a YAML file"""
															
 
																     ctx.obj['output'] = Yaml(yaml_file)
															
@@ -58,7 +62,7 @@ def timeline(ctx, limit, user):
 
																 @click.argument('user')
															
 
																 @click.pass_context
															
 
																 def profile(ctx, user):
															
 
																-    """Get basic info about some user"""
															
 
																+    """Get a user's profile information"""
															
 
																     start_node = NodeTimeline(user, limit=1)
															
 
																     strategy = StrategyDumb(start_node, ParserUser)
															
 
																     strategy(ctx.obj['output'])
															
@@ -67,6 +71,7 @@ def profile(ctx, user):
 
																 @db.command()
															
 
																 @click.pass_context
															
 
																 def shell(ctx):
															
 
																+    """Launch an IPython session to interact with the database"""
															
 
																     session = ctx.obj['output'].start()
															
 
																     user_ns = {
															
 
																         'db': ctx.obj['output'],
															
--- a/twhatter/exploration/node/base.py
+++ b/twhatter/exploration/node/base.py
@@ -4,8 +4,9 @@ logger = logging.getLogger(__name__)
 
																 class NodeBase:
															
 
																     """Base class for nodes, which are all the pages that Twitter allows us
															
 
																-    to visit. They can be iterated on, and will yield 'soup' within the limits
															
 
																-    defined at initialization"""
															
 
																+    to visit with a simple HTTP client.
															
 
																+    They behave as generators, and yield `PageElement` data as processed
															
 
																+    by the `BeautifulSoup` library."""
															
 
																     def __init__(self):
															
 
																         logger.debug("Initializing {}".format(self.__class__.__qualname__))
															
--- a/twhatter/exploration/node/timeline.py
+++ b/twhatter/exploration/node/timeline.py
@@ -12,6 +12,9 @@ logger = logging.getLogger(__name__)
 
																 class NodeTimeline(NodeBase):
															
 
																+    """Implementation of the "timeline" node, which is the page accessed by
															
 
																+    https://twitter.com/the_user_name, that can be scrolled until the beginning
															
 
																+    of time."""
															
 
																     user_agent = generate_user_agent(os='linux')
															
 
																     def __init__(self, user, limit=100):
															
@@ -28,8 +31,8 @@ class NodeTimeline(NodeBase):
 
																         logger.info("{} tweets retrieved so far".format(self.nb_tweets))
															
 
																     @classmethod
															
 
																-    def get_user_timeline(cls, user_handle):
															
 
																-        logger.info("Loading initial timeline for {}".format(user_handle))
															
 
																+    def _get_base_page(cls, user_handle):
															
 
																+        logger.info("Loading base page for {}'s timeline".format(user_handle))
															
 
																         url = "https://twitter.com/{}".format(user_handle)
															
 
																         return requests.get(
															
 
																             url,
															
@@ -39,8 +42,8 @@ class NodeTimeline(NodeBase):
 
																             }
															
 
																         )
															
 
																-    def get_more_tweets(self):
															
 
																-        logger.info("Loading more tweets from {}".format(self.user))
															
 
																+    def _scroll(self):
															
 
																+        logger.info("Scrolling in {}'s timeline".format(self.user))
															
 
																         return requests.get(
															
 
																             "https://twitter.com/i/profiles/show/{}/timeline/tweets".format(self.user),
															
 
																             params= dict(
															
@@ -54,14 +57,14 @@ class NodeTimeline(NodeBase):
 
																     def __iter__(self):
															
 
																         super().__iter__()
															
 
																-        tweets = self.get_user_timeline(self.user)
															
 
																-        soup = BeautifulSoup(tweets.text, "lxml")
															
 
																+        base = self._get_base_page(self.user)
															
 
																+        soup = BeautifulSoup(base.text, "lxml")
															
 
																         self._update_state(soup)
															
 
																         yield soup
															
 
																         while self.nb_tweets < self.limit:
															
 
																-            more_tweets = self.get_more_tweets()
															
 
																-            html = json.loads(more_tweets.content)
															
 
																+            more = self._scroll()
															
 
																+            html = json.loads(more.content)
															
 
																             soup = BeautifulSoup(html['items_html'], "lxml")
															
 
																             if not soup.text:
															
--- a/twhatter/exploration/strategy/base.py
+++ b/twhatter/exploration/strategy/base.py
@@ -9,18 +9,16 @@ class StrategyBase:
 
																     parser_classes = []
															
 
																     def __init__(self, starting_node: 'NodeBase', *parser_classes: 'ParserBase') -> None:
															
 
																-        logger.debug(
															
 
																-            "Initializing {} with starting_node={} and parser_classes={}".format(
															
 
																-                self.__class__.__qualname__,
															
 
																-                starting_node,
															
 
																-                parser_classes
															
 
																-            )
															
 
																-        )
															
 
																+        """
															
 
																+        :param starting_node: the node from which exploration starts
															
 
																+        :param parser_classes: parsers that should be applied on each iteration
															
 
																+        of the node
															
 
																+        """
															
 
																         self.starting_node = starting_node
															
 
																         self.parser_classes = parser_classes
															
 
																     def __call__(self, output) -> None:
															
 
																-        logger.debug("Applying {}".format(self.__class__.__qualname__))
															
 
																+        logger.debug("Applying {}".format(self))
															
 
																     def __repr__(self):
															
 
																         return "<{} (starting_node={}, parsers={})>".format(
															
--- a/twhatter/exploration/strategy/dumb.py
+++ b/twhatter/exploration/strategy/dumb.py
@@ -8,7 +8,8 @@ logger = logging.getLogger(__name__)
 
																 class StrategyDumb(StrategyBase):
															
 
																-    """This strategy only explores the initial node"""
															
 
																+    """This strategy only explores the initial node and scrolls through it
															
 
																+    until exhaustion"""
															
 
																     def __call__(self, output):
															
 
																         super().__call__(output)
															
 
																         output.start()
															
@@ -20,6 +21,7 @@ class StrategyDumb(StrategyBase):
 
																                 logger.debug("Parsing new soup with {}".format(parser))
															
 
																                 for o in parser(s):
															
 
																                     objs.append(o)
															
 
																+
															
 
																         tweets = [t for t in objs if isinstance(t, TweetBase)]
															
 
																         output.output_tweets(tweets)
															
 
																         users = [u for u in objs if isinstance(u, User)]
															
--- a/twhatter/parser/base.py
+++ b/twhatter/parser/base.py
@@ -2,10 +2,11 @@ from typing import Any
 
																 class ParserBase:
															
 
																-    """Base class for a parser, an iterator that yield all elements of a certain
															
 
																-    type within a given page"""
															
 
																+    """Base class for a parser
															
 
																+    A parser behaves like a generator, and yields all elements of a certain
															
 
																+    type within the original page"""
															
 
																     def __init__(self, soup: 'PageElement') -> None:
															
 
																-        pass
															
 
																+        raise NotImplementedError()
															
 
																     def __iter__(self) -> Any:
															
 
																         raise NotImplementedError()
															
--- a/twhatter/parser/media.py
+++ b/twhatter/parser/media.py
@@ -64,7 +64,6 @@ def media_factory(soup: BeautifulSoup) -> Optional[MediaBase]:
 
																 class ParserMedia(ParserBase):
															
 
																     def __init__(self, soup):
															
 
																-        super().__init__(soup)
															
 
																         self.soup = soup
															
 
																     def __iter__(self):
															
--- a/twhatter/parser/tweet.py
+++ b/twhatter/parser/tweet.py
@@ -257,8 +257,6 @@ def tweet_factory(soup: BeautifulSoup) -> TweetBase:
 
																 class ParserTweet(ParserBase):
															
 
																     def __init__(self, soup):
															
 
																-        super().__init__(soup)
															
 
																-
															
 
																         # Here 'soup' can be either the full html page as loaded initially,
															
 
																         # or raw HTML incoming from the XHR requests sent when browsing deeper
															
 
																         # in the page, so the strategy is adapted to work in both cases.
															
--- a/twhatter/parser/user.py
+++ b/twhatter/parser/user.py
@@ -91,7 +91,6 @@ class User(ExtractableMixin):
 
																 class ParserUser(ParserBase):
															
 
																     def __init__(self, soup):
															
 
																-        super().__init__(soup)
															
 
																         self.soup = soup
															
 
																     def __iter__(self):