Some documentation and renaming

theenglishway committed 7 years ago
parent commit 63e4e4b600

+ 5 - 0
HISTORY.rst

@@ -2,6 +2,11 @@
 History
 =======
 
+0.2.0 (2019-01-23)
+------------------
+
+* First proper release.
+
 0.1.0 (2019-01-09)
 ------------------
 

+ 20 - 6
README.rst

@@ -15,8 +15,8 @@ anonymous browsing session.
 
 This is mostly an attempt for me to produce some clean, functional and
 maintainable Python code. I have especially focused on a clean separation
-between data retrieval and output, which should allow to export data in any
-format.
+between data retrieval, Twitter page exploration, and output, which makes it
+easy to define and combine various crawling strategies and data formats.
 
 And why that terrible name? Simple: "WHAT's going on TWITTER?" => TWHATTER!
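
To give a rough idea of what this combination looks like in code, mirroring the `profile` and `timeline` commands of `cli.py` further down in this diff (the import paths and the location of the `Print` output are assumed here, not shown in the diff) ::

    # Sketch only -- import paths are assumed from the file layout in this diff
    from twhatter.exploration.node.timeline import NodeTimeline
    from twhatter.exploration.strategy.dumb import StrategyDumb
    from twhatter.parser import ParserTweet, ParserUser
    from twhatter.output import Print  # assumed location of the Print output used in cli.py

    node = NodeTimeline("realDonaldTrump", limit=40)        # retrieval + exploration
    strategy = StrategyDumb(node, ParserTweet, ParserUser)  # crawling strategy + parsers
    strategy(Print())                                       # output / data format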
 
@@ -34,8 +34,11 @@ Anonymous client
 Data output
 ***********
 
-All scraped information can either be displayed on the terminal or stored into
-a local database.
+All scraped information can be:
+
+* displayed on the terminal,
+* stored into a JSON / YAML file, or
+* stored into a local database.
 
 Installation
 ------------
@@ -44,7 +47,7 @@ Installation requires Python >= 3.6. ::
 
     $ pip install --user git+https://code.theenglishway.eu/theenglishway-corp/twhatter
 
-You then have to ensure that `~/.local/bin` in your `$PATH` or call
+You then have to ensure that `~/.local/bin` is in your `$PATH` or call
 `~/.local/bin/twhatter` instead of `twhatter` in the following examples.
 
 Usage
@@ -52,12 +55,13 @@ Usage
 
 Display some user's tweets ::
 
-    $ twhatter timeline realDonaldTrump --limit 10
+    $ twhatter timeline realDonaldTrump --limit 40
     <TweetTextOnly (id=1083404900862545920, date=2019-01-10 16:47:11, likes=32033, likes=11087, likes=6935)>
     <TweetTextOnly (id=1083358775925460992, date=2019-01-10 13:43:54, likes=96565, likes=22596, likes=26802)>
     <TweetTextOnly (id=1083358611315789826, date=2019-01-10 13:43:15, likes=52849, likes=9344, likes=9571)>
     <TweetTextOnly (id=1083358150214979585, date=2019-01-10 13:41:25, likes=48808, likes=11096, likes=11499)>
     <TweetTextOnly (id=1083356326833602561, date=2019-01-10 13:34:10, likes=50695, likes=11743, likes=11045)>
+    ...
     <TweetTextOnly (id=1083353895030702080, date=2019-01-10 13:24:30, likes=85184, likes=19686, likes=27751)>
     <TweetRetweet (id=1083121283645272064, date=2019-01-09 22:00:12, likes=42640, likes=13189, likes=10242)>
     <TweetRetweet (id=1082774275390693376, date=2019-01-08 23:01:18, likes=52776, likes=14459, likes=2403)>
@@ -69,6 +73,11 @@ Display their profile information ::
     $ twhatter profile realDonaldTrump
     User(id=25073877, fullname='Donald J. Trump', join_date=datetime.datetime(2009, 3, 18, 0, 0), tweets_nb=40183, following_nb=45, followers_nb=57144827, likes_nb=7)
 
+Put them into a JSON/YAML file ::
+
+    $ twhatter json timeline realDonaldTrump
+    $ twhatter yaml profile realDonaldTrump
+
 Put them into a local database (by default in /tmp/db.sqlite) ::
 
     $ twhatter db timeline realDonaldTrump
@@ -86,6 +95,11 @@ Open a session on the local database and make queries with SQLAlchemy ::
      <Tweet (id=1026482814164844544),
      <Tweet (id=1027797734613504001)]
 
+In all cases, help is available ::
+
+    $ twhatter --help
+
+
 Tests
 -----
 

+ 1 - 1
tests/conftest.py

@@ -148,7 +148,7 @@ def tweet_collection():
 def raw_html_user_initial_page_factory():
     def _raw_html_user_initial_page(user):
         n = NodeTimeline(user)
-        response = n.get_user_timeline(user)
+        response = n._get_base_page(user)
         return BeautifulSoup(response.text, "lxml")
     return _raw_html_user_initial_page
 

+ 6 - 1
twhatter/cli.py

@@ -18,6 +18,7 @@ from twhatter.parser import ParserTweet, ParserUser
               default='info', show_default=True)
 @click.pass_context
 def main(ctx, verbosity):
+    """Output various information from Twitter"""
     log_setup(verbosity)
     ctx.ensure_object(dict)
     ctx.obj['output'] = Print()
@@ -27,6 +28,7 @@ def main(ctx, verbosity):
 @click.option('-d', '--db_url', type=str, default="sqlite:////tmp/db.sqlite3", show_default=True)
 @click.pass_context
 def db(ctx, db_url):
+    """Output information into a database"""
     ctx.obj['output'] = Database(db_url)
 
 
@@ -34,6 +36,7 @@ def db(ctx, db_url):
 @click.option('-f', '--json_file', type=str, default="/tmp/output.json", show_default=True)
 @click.pass_context
 def json(ctx, json_file):
+    """Output information into a JSON file"""
     ctx.obj['output'] = Json(json_file)
 
 
@@ -41,6 +44,7 @@ def json(ctx, json_file):
 @click.option('-f', '--yaml_file', type=str, default="/tmp/output.yaml", show_default=True)
 @click.pass_context
 def yaml(ctx, yaml_file):
+    """Output information into a YAML file"""
     ctx.obj['output'] = Yaml(yaml_file)
 
 
@@ -58,7 +62,7 @@ def timeline(ctx, limit, user):
 @click.argument('user')
 @click.pass_context
 def profile(ctx, user):
-    """Get basic info about some user"""
+    """Get a user's profile information"""
     start_node = NodeTimeline(user, limit=1)
     strategy = StrategyDumb(start_node, ParserUser)
     strategy(ctx.obj['output'])
@@ -67,6 +71,7 @@ def profile(ctx, user):
 @db.command()
 @click.pass_context
 def shell(ctx):
+    """Launch an IPython session to interact with the database"""
     session = ctx.obj['output'].start()
     user_ns = {
         'db': ctx.obj['output'],

+ 3 - 2
twhatter/exploration/node/base.py

@@ -4,8 +4,9 @@ logger = logging.getLogger(__name__)
 
 class NodeBase:
     """Base class for nodes, which are all the pages that Twitter allows us
-    to visit. They can be iterated on, and will yield 'soup' within the limits
-    defined at initialization"""
+    to visit with a simple HTTP client.
+    They behave as generators, and yield `PageElement` data as processed
+    by the `BeautifulSoup` library."""
     def __init__(self):
         logger.debug("Initializing {}".format(self.__class__.__qualname__))
 

+ 11 - 8
twhatter/exploration/node/timeline.py

@@ -12,6 +12,9 @@ logger = logging.getLogger(__name__)
 
 
 class NodeTimeline(NodeBase):
+    """Implementation of the "timeline" node, which is the page accessed by
+    https://twitter.com/the_user_name, that can be scrolled until the beginning
+    of times."""
     user_agent = generate_user_agent(os='linux')
 
     def __init__(self, user, limit=100):
@@ -28,8 +31,8 @@ class NodeTimeline(NodeBase):
         logger.info("{} tweets retrieved so far".format(self.nb_tweets))
 
     @classmethod
-    def get_user_timeline(cls, user_handle):
-        logger.info("Loading initial timeline for {}".format(user_handle))
+    def _get_base_page(cls, user_handle):
+        logger.info("Loading base page for {}'s timeline".format(user_handle))
         url = "https://twitter.com/{}".format(user_handle)
         return requests.get(
             url,
@@ -39,8 +42,8 @@ class NodeTimeline(NodeBase):
             }
         )
 
-    def get_more_tweets(self):
-        logger.info("Loading more tweets from {}".format(self.user))
+    def _scroll(self):
+        logger.info("Scrolling in {}'s timeline".format(self.user))
         return requests.get(
             "https://twitter.com/i/profiles/show/{}/timeline/tweets".format(self.user),
             params= dict(
@@ -54,14 +57,14 @@ class NodeTimeline(NodeBase):
 
     def __iter__(self):
         super().__iter__()
-        tweets = self.get_user_timeline(self.user)
-        soup = BeautifulSoup(tweets.text, "lxml")
+        base = self._get_base_page(self.user)
+        soup = BeautifulSoup(base.text, "lxml")
         self._update_state(soup)
         yield soup
 
         while self.nb_tweets < self.limit:
-            more_tweets = self.get_more_tweets()
-            html = json.loads(more_tweets.content)
+            more = self._scroll()
+            html = json.loads(more.content)
 
             soup = BeautifulSoup(html['items_html'], "lxml")
             if not soup.text:
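
With the renamed helpers, iterating the node is what drives them: `_get_base_page()` is called once for the first soup and `_scroll()` for every following one, until `limit` tweets have been seen. Roughly (import path assumed from the layout) ::

    from twhatter.exploration.node.timeline import NodeTimeline  # path assumed

    node = NodeTimeline("realDonaldTrump", limit=40)
    for soup in node:
        # first soup: the base page; later soups: the 'items_html' chunks
        print(soup.text[:80])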

+ 6 - 8
twhatter/exploration/strategy/base.py

@@ -9,18 +9,16 @@ class StrategyBase:
     parser_classes = []
 
     def __init__(self, starting_node: 'NodeBase', *parser_classes: 'ParserBase') -> None:
-        logger.debug(
-            "Initializing {} with starting_node={} and parser_classes={}".format(
-                self.__class__.__qualname__,
-                starting_node,
-                parser_classes
-            )
-        )
+        """
+        :param starting_node: the node from which exploration starts
+        :param parser_classes: parsers that should be applied on each iteration
+            of the node
+        """
         self.starting_node = starting_node
         self.parser_classes = parser_classes
 
     def __call__(self, output) -> None:
-        logger.debug("Applying {}".format(self.__class__.__qualname__))
+        logger.debug("Applying {}".format(self))
 
     def __repr__(self):
         return "<{} (starting_node={}, parsers={})>".format(

+ 3 - 1
twhatter/exploration/strategy/dumb.py

@@ -8,7 +8,8 @@ logger = logging.getLogger(__name__)
 
 
 class StrategyDumb(StrategyBase):
-    """This strategy only explores the initial node"""
+    """This strategy only explores the initial node and scrolls through it
+    until exhaustion"""
     def __call__(self, output):
         super().__call__(output)
         output.start()
@@ -20,6 +21,7 @@ class StrategyDumb(StrategyBase):
                 logger.debug("Parsing new soup with {}".format(parser))
                 for o in parser(s):
                     objs.append(o)
+
         tweets = [t for t in objs if isinstance(t, TweetBase)]
         output.output_tweets(tweets)
         users = [u for u in objs if isinstance(u, User)]
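
The base/dumb split leaves room for other strategies. As a purely hypothetical sketch (class name invented here, `TweetBase` location assumed; only the `StrategyBase` attributes and the `__call__(output)` protocol are taken from this diff), a strategy could parse just the first soup ::

    from twhatter.exploration.strategy.base import StrategyBase  # path assumed
    from twhatter.parser.tweet import TweetBase                  # TweetBase location assumed


    class StrategyFirstPage(StrategyBase):
        """Hypothetical strategy that parses only the first soup yielded by the node."""
        def __call__(self, output):
            super().__call__(output)
            output.start()
            soup = next(iter(self.starting_node))
            objs = [o for parser in self.parser_classes for o in parser(soup)]
            output.output_tweets([o for o in objs if isinstance(o, TweetBase)])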

+ 4 - 3
twhatter/parser/base.py

@@ -2,10 +2,11 @@ from typing import Any
 
 
 class ParserBase:
-    """Base class for a parser, an iterator that yield all elements of a certain
-    type within a given page"""
+    """Base class for a parser
+    A parser behaves like a generators, and yield all elements of a certain
+    type within the original page"""
     def __init__(self, soup: 'PageElement') -> None:
-        pass
+        raise NotImplementedError()
 
     def __iter__(self) -> Any:
         raise NotImplementedError()
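
Since `ParserBase.__init__` now raises `NotImplementedError`, the concrete parsers below keep the soup themselves and no longer call `super().__init__()`. A minimal hypothetical parser following the same contract (class name invented here, import path assumed from the layout) ::

    from bs4 import BeautifulSoup

    from twhatter.parser.base import ParserBase  # path assumed from the layout


    class ParserLink(ParserBase):
        """Hypothetical parser that yields every link found in the page."""
        def __init__(self, soup: BeautifulSoup):
            self.soup = soup  # no super().__init__() call, as in the parsers below

        def __iter__(self):
            for a in self.soup.find_all("a", href=True):
                yield a["href"]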

+ 0 - 1
twhatter/parser/media.py

@@ -64,7 +64,6 @@ def media_factory(soup: BeautifulSoup) -> Optional[MediaBase]:
 
 class ParserMedia(ParserBase):
     def __init__(self, soup):
-        super().__init__(soup)
         self.soup = soup
 
     def __iter__(self):

+ 0 - 2
twhatter/parser/tweet.py

@@ -257,8 +257,6 @@ def tweet_factory(soup: BeautifulSoup) -> TweetBase:
 
 class ParserTweet(ParserBase):
     def __init__(self, soup):
-        super().__init__(soup)
-
         # Here 'soup' can be either the full html page as loaded initially,
         # or raw HTML incoming from the XHR requests sent when browsing deeper
         # in the page, so the strategy is adapted to work in both cases.

+ 0 - 1
twhatter/parser/user.py

@@ -91,7 +91,6 @@ class User(ExtractableMixin):
 
 class ParserUser(ParserBase):
     def __init__(self, soup):
-        super().__init__(soup)
         self.soup = soup
 
     def __iter__(self):