Browse Source

Add "exploration" module (breaking)

The "exploration" module is now in charge of fetching data from the client
and sending it to the output module. It decides how Twitter should be
explored from a given starting point (e.g. whether we fetch the profiles
of the users that appear in a given timeline).

The output API is now completely dumb and only executes orders from the
"exploration" module.

As a result, Db output is temporarily broken (print, json & yaml outputs
have been updated to match this new API)
theenglishway (time) 7 years ago
parent
commit
a258bdf2dc

+ 7 - 2
twhatter/cli.py

@@ -8,6 +8,7 @@ import IPython
 from twhatter.output import Print, Json, Database, Yaml
 from twhatter.output.sqlalchemy import Tweet, User
 from twhatter.log import log_setup
+from twhatter.exploration import StrategyDumb, NodeTimeline, NodeProfile
 
 
 @click.group()
@@ -48,14 +49,18 @@ def yaml(ctx, yaml_file):
 @click.pass_context
 def timeline(ctx, limit, user):
     """Get some user's Tweets"""
-    ctx.obj['output'].output_tweets(user, limit)
+    start_node = NodeTimeline(user, limit)
+    strategy = StrategyDumb(start_node)
+    strategy(ctx.obj['output'])
 
 @main.command()
 @click.argument('user')
 @click.pass_context
 def profile(ctx, user):
     """Get basic info about some user"""
-    ctx.obj['output'].output_user(user)
+    start_node = NodeProfile(user)
+    strategy = StrategyDumb(start_node)
+    strategy(ctx.obj['output'])
 
 
 @db.command()

+ 2 - 0
twhatter/exploration/__init__.py

@@ -0,0 +1,2 @@
+from .node import *
+from .strategy import *

+ 7 - 0
twhatter/exploration/node/__init__.py

@@ -0,0 +1,7 @@
+from .timeline import NodeTimeline
+from .profile import NodeProfile
+
+__all__ = [
+    'NodeTimeline',
+    'NodeProfile',
+]

+ 27 - 0
twhatter/exploration/node/base.py

@@ -0,0 +1,27 @@
+import logging
+
+logger = logging.getLogger(__name__)
+
+class NodeBase:
+    """Common parent of all nodes.
+
+    A node stands for one of the pages that Twitter allows us to visit.
+    Nodes can be iterated on, and will yield data within the limits defined
+    at initialization.
+    """
+
+    def __init__(self):
+        """Log which concrete node class is being initialized."""
+        class_name = type(self).__qualname__
+        logger.debug("Initializing {}".format(class_name))
+
+    # TODO: there should be one function per kind of object (iter_tweets,
+    #  iter_users, ...)
+    def __iter__(self):
+        """Log the start of an iteration.
+
+        This base implementation only logs and returns None, so a bare
+        NodeBase is not iterable by itself.
+        """
+        class_name = type(self).__qualname__
+        logger.debug("Iterating on {}".format(class_name))

+ 18 - 0
twhatter/exploration/node/profile.py

@@ -0,0 +1,18 @@
+import logging
+from .base import NodeBase
+from twhatter.client import ClientProfile
+
+
+class NodeProfile(NodeBase):
+    """Node backed by a ``ClientProfile``; iterating on it yields one user."""
+
+    def __init__(self, user):
+        """Build the ``ClientProfile`` for ``user``."""
+        super().__init__()
+        self.client = ClientProfile(user)
+
+    def __iter__(self):
+        """Yield the client's ``user`` attribute, once."""
+        super().__iter__()
+        profile_user = self.client.user
+        yield profile_user

+ 18 - 0
twhatter/exploration/node/timeline.py

@@ -0,0 +1,18 @@
+import logging
+from .base import NodeBase
+from twhatter.client import ClientTimeline
+
+
+class NodeTimeline(NodeBase):
+    """Node backed by a ``ClientTimeline``; yields what the client yields."""
+
+    def __init__(self, user, limit=100):
+        """Build the ``ClientTimeline`` for ``user`` and ``limit``."""
+        super().__init__()
+        self.client = ClientTimeline(user, limit)
+
+    def __iter__(self):
+        """Yield every item produced by iterating on the client."""
+        super().__iter__()
+        for item in self.client:
+            yield item

+ 5 - 0
twhatter/exploration/strategy/__init__.py

@@ -0,0 +1,5 @@
+from .dumb import StrategyDumb
+
+__all__ = [
+    'StrategyDumb',
+]

+ 22 - 0
twhatter/exploration/strategy/base.py

@@ -0,0 +1,22 @@
+import logging
+
+logger = logging.getLogger(__name__)
+
+class StrategyBase:
+    """Base class for strategies, which define a way to explore Twitter pages"""
+
+    def __init__(self, starting_node: 'NodeBase') -> None:
+        """Log the initialization, then store ``starting_node``."""
+        message = "Initializing {} with starting_node={}".format(
+            type(self).__qualname__,
+            starting_node,
+        )
+        logger.debug(message)
+        self.starting_node = starting_node
+
+    def __call__(self, output) -> None:
+        """Log that the strategy is being applied.
+
+        ``output`` is not used by this base implementation.
+        """
+        logger.debug("Applying {}".format(type(self).__qualname__))

+ 26 - 0
twhatter/exploration/strategy/dumb.py

@@ -0,0 +1,26 @@
+from .base import StrategyBase
+from twhatter.parser import TweetBase, User
+
+
+class StrategyDumb(StrategyBase):
+    """This strategy only explores the initial node"""
+
+    def __call__(self, output):
+        """Send the starting node's tweets and users to ``output``.
+
+        The node is iterated on a single time and the result is kept in a
+        list: every iteration on a node goes through its client again, so
+        filtering tweets and users in two separate passes would do the work
+        (and log it) twice.
+
+        ``output`` must provide ``start``, ``output_tweets``,
+        ``output_users`` and ``stop``.
+        """
+        super().__call__(output)
+        output.start()
+
+        objects = list(self.starting_node)
+        output.output_tweets([o for o in objects if isinstance(o, TweetBase)])
+        output.output_users([o for o in objects if isinstance(o, User)])
+
+        output.stop()

+ 12 - 3
twhatter/output/base.py

@@ -1,10 +1,19 @@
+from typing import List
+
+
 class OutputBase:
     """Base class for scraper's data output"""
-    def output_tweets(self, user, limit) -> None:
+    def start(self):
+        pass
+
+    def output_tweets(self, tweets: List['TweetBase']) -> None:
         raise NotImplementedError()
 
-    def output_user(self, user) -> None:
+    def output_users(self, users: List['User']) -> None:
         raise NotImplementedError()
 
-    def output_medias(self, user) -> None:
+    def output_medias(self, medias: List['MediaBase']) -> None:
         raise NotImplementedError()
+
+    def stop(self):
+        pass

+ 7 - 9
twhatter/output/json.py

@@ -4,7 +4,6 @@ from datetime import datetime
 from bs4 import PageElement
 
 from .base import OutputBase
-from twhatter.client import ClientTimeline, ClientProfile
 
 
 logger = logging.getLogger(__name__)
@@ -25,15 +24,14 @@ class Json(OutputBase):
     def __init__(self, json_path):
         logger.info("Output set to {}".format(json_path))
         self.json_path = json_path
+        self.all_objects = []
 
-    def output_tweets(self, user, limit):
-        client_timeline = ClientTimeline(user, limit)
+    def output_tweets(self, tweets):
+        self.all_objects += tweets
 
-        with open(self.json_path, 'w') as f:
-            json.dump([t for t in client_timeline], f, cls=TweeterEncoder, indent=4)
-
-    def output_user(self, user):
-        p = ClientProfile(user)
+    def output_users(self, users):
+        self.all_objects += users
 
+    def stop(self):
         with open(self.json_path, 'w') as f:
-            json.dump(p.user, f, cls=TweeterEncoder, indent=4)
+            json.dump([o for o in self.all_objects], f, cls=TweeterEncoder, indent=4)

+ 5 - 8
twhatter/output/print.py

@@ -1,14 +1,11 @@
 from .base import OutputBase
-from twhatter.client import ClientTimeline, ClientProfile
 
 
 class Print(OutputBase):
-    def output_tweets(self, user, limit):
-        client_timeline = ClientTimeline(user, limit)
-
-        for t in client_timeline:
+    def output_tweets(self, tweets):
+        for t in tweets:
             print(t)
 
-    def output_user(self, user):
-        p = ClientProfile(user)
-        print(p.user)
+    def output_users(self, users):
+        for u in users:
+            print(u)

+ 2 - 2
twhatter/output/sqlalchemy/db.py

@@ -45,7 +45,7 @@ class Database(OutputBase):
             session.rollback()
             return 0
 
-    def output_tweets(self, user, limit):
+    def output_tweets(self, tweets):
         client_timeline = ClientTimeline(user, limit)
         Tweet = class_registry['Tweet']
         User = class_registry['User']
@@ -72,7 +72,7 @@ class Database(OutputBase):
 
         self.stop(session)
 
-    def output_user(self, user):
+    def output_users(self, users):
         User = class_registry['User']
         p = ClientProfile(user)
         session = self.start()

+ 7 - 9
twhatter/output/yaml.py

@@ -3,7 +3,6 @@ import logging
 from bs4 import PageElement
 
 from .base import OutputBase
-from twhatter.client import ClientTimeline, ClientProfile
 
 
 logger = logging.getLogger(__name__)
@@ -20,15 +19,14 @@ class Yaml(OutputBase):
     def __init__(self, yaml_path):
         logger.info("Output set to {}".format(yaml_path))
         self.yaml_path = yaml_path
+        self.all_objects = []
 
-    def output_tweets(self, user, limit):
-        client_timeline = ClientTimeline(user, limit)
+    def output_tweets(self, tweets):
+        self.all_objects += tweets
 
-        with open(self.yaml_path, 'w') as f:
-            yaml.dump([t for t in client_timeline], f, indent=2)
-
-    def output_user(self, user):
-        p = ClientProfile(user)
+    def output_users(self, users):
+        self.all_objects += users
 
+    def stop(self):
         with open(self.yaml_path, 'w') as f:
-            yaml.dump(p.user, f, indent=2, default_flow_style=False)
+            yaml.dump([u for u in self.all_objects], f, indent=2, default_flow_style=False)