소스 검색

Move HTTP client code into "exploration"

That breaks quite a few things for now but leads to much cleaner code
theenglishway 7 년 전
부모
커밋
00d6fb9ce0

+ 3 - 3
tests/conftest.py

@@ -4,7 +4,7 @@ from datetime import datetime
 from click.testing import CliRunner
 from bs4 import BeautifulSoup
 
-from twhatter.client import ClientTimeline
+from twhatter.exploration import NodeTimeline
 from twhatter.parser import tweet_factory
 from typing import NamedTuple, List
 
@@ -152,8 +152,8 @@ def tweet_collection():
 @pytest.fixture(scope="session")
 def raw_html_user_initial_page_factory():
     def _raw_html_user_initial_page(user):
-        a = ClientTimeline(user)
-        response = a.get_user_timeline(user)
+        n = NodeTimeline(user)
+        response = n.get_user_timeline(user)
         return BeautifulSoup(response.text, "lxml")
     return _raw_html_user_initial_page
 

+ 0 - 9
twhatter/__main__.py

@@ -1,9 +0,0 @@
-from twhatter.client import ClientTimeline
-from twhatter.output import Print
-
-
-user = "the_english_way"
-timeline = ClientTimeline(user)
-
-for t in timeline:
-    Print(t)()

+ 4 - 3
twhatter/cli.py

@@ -9,6 +9,7 @@ from twhatter.output import Print, Json, Database, Yaml
 from twhatter.output.sqlalchemy import Tweet, User
 from twhatter.log import log_setup
 from twhatter.exploration import StrategyDumb, NodeTimeline
+from twhatter.parser import ParserTweet, ParserUser
 
 
 @click.group()
@@ -50,7 +51,7 @@ def yaml(ctx, yaml_file):
 def timeline(ctx, limit, user):
     """Get some user's Tweets"""
     start_node = NodeTimeline(user, limit)
-    strategy = StrategyDumb(start_node)
+    strategy = StrategyDumb(start_node, ParserTweet, ParserUser)
     strategy(ctx.obj['output'])
 
 @main.command()
@@ -58,8 +59,8 @@ def timeline(ctx, limit, user):
 @click.pass_context
 def profile(ctx, user):
     """Get basic info about some user"""
-    start_node = NodeTimeline(user)
-    strategy = StrategyDumb(start_node)
+    start_node = NodeTimeline(user, limit=1)
+    strategy = StrategyDumb(start_node, ParserUser)
     strategy(ctx.obj['output'])
 
 

+ 0 - 88
twhatter/client.py

@@ -1,88 +0,0 @@
-import requests
-from bs4 import BeautifulSoup
-from user_agent import generate_user_agent
-
-from twhatter.parser import TweetList, user_factory
-import json
-import logging
-
-
-logger = logging.getLogger(__name__)
-
-
-class Client():
-    user_agent = generate_user_agent(os='linux')
-
-    @classmethod
-    def get_user_timeline(cls, user_handle):
-        logger.info("Loading initial timeline for {}".format(user_handle))
-        url = "https://twitter.com/{}".format(user_handle)
-        return requests.get(
-            url,
-            headers={
-                'User-Agent': cls.user_agent,
-                'Accept-Language': 'en'
-            }
-        )
-
-
-class ClientTimeline(Client):
-    """Access and explore some user's timeline"""
-    def __init__(self, user, limit=100):
-        self.user = user
-        self.earliest_tweet = None
-        self.nb_tweets = 0
-        self.limit = limit
-
-    def _update_state(self, earliest_tweet):
-        self.earliest_tweet = earliest_tweet.id
-        self.nb_tweets += 1
-
-    def get_more_tweets(self):
-        logger.info(
-            "Loading more tweets from {} ({})".format(self.user, self.nb_tweets)
-        )
-        return requests.get(
-            "https://twitter.com/i/profiles/show/{}/timeline/tweets".format(self.user),
-            params= dict(
-                include_available_features=1,
-                include_entities=1,
-                max_position=self.earliest_tweet,
-                reset_error_state=False
-            ),
-            headers={'User-Agent': self.user_agent}
-        )
-
-    def __iter__(self):
-        tweets = self.get_user_timeline(self.user)
-        soup = BeautifulSoup(tweets.text, "lxml")
-        t_list = TweetList(soup)
-
-        for t in t_list:
-            yield t
-            self._update_state(t)
-            if self.nb_tweets >= self.limit:
-                break
-
-        while True and self.nb_tweets < self.limit:
-            more_tweets = self.get_more_tweets()
-            html = json.loads(more_tweets.content)
-            soup = BeautifulSoup(html['items_html'], "lxml")
-            t_list = TweetList(soup)
-
-            if len(t_list) == 0:
-                break
-
-            for t in t_list:
-                yield t
-                self._update_state(t)
-
-
-class ClientProfile(Client):
-    """Get profile information about an user"""
-    def __init__(self, user_handle):
-        self.user_handle = user_handle
-        user_page = self.get_user_timeline(user_handle)
-        soup = BeautifulSoup(user_page.text, "lxml")
-
-        self.user = user_factory(soup)

+ 2 - 4
twhatter/exploration/node/base.py

@@ -4,12 +4,10 @@ logger = logging.getLogger(__name__)
 
 class NodeBase:
     """Base class for nodes, which are all the pages that Twitter allows us
-    to visit. They can be iterated on, and will yield data within the limits
+    to visit. They can be iterated on, and will yield 'soup' within the limits
     defined at initialization"""
     def __init__(self):
         logger.debug("Initializing {}".format(self.__class__.__qualname__))
 
-    # TODO: there should be one function per kind of object (iter_tweets,
-    #  iter_users, ...)
-    def __iter__(self):
+    def __iter__(self) -> 'PageElement':
         logger.debug("Iterating on {}".format(self.__class__.__qualname__))

+ 67 - 3
twhatter/exploration/node/timeline.py

@@ -1,13 +1,77 @@
 import logging
+import json
+
+import requests
+from bs4 import BeautifulSoup
+from user_agent import generate_user_agent
+
 from .base import NodeBase
-from twhatter.client import ClientTimeline
+from twhatter.parser import ParserTweet
+
+logger = logging.getLogger(__name__)
 
 
 class NodeTimeline(NodeBase):
+    user_agent = generate_user_agent(os='linux')
+
     def __init__(self, user, limit=100):
         super().__init__()
-        self.client = ClientTimeline(user, limit)
+        self.user = user
+        self.earliest_tweet_id = None
+        self.nb_tweets = 0
+        self.limit = limit
+
+    def _update_state(self, soup):
+        tweets = ParserTweet(soup)
+        self.nb_tweets += len(tweets)
+        *_, self.earliest_tweet_id = (t.id for t in tweets)
+
+    @classmethod
+    def get_user_timeline(cls, user_handle):
+        logger.info("Loading initial timeline for {}".format(user_handle))
+        url = "https://twitter.com/{}".format(user_handle)
+        return requests.get(
+            url,
+            headers={
+                'User-Agent': cls.user_agent,
+                'Accept-Language': 'en'
+            }
+        )
+
+    def get_more_tweets(self):
+        logger.info("Loading more tweets from {}".format(self.user))
+        return requests.get(
+            "https://twitter.com/i/profiles/show/{}/timeline/tweets".format(self.user),
+            params= dict(
+                include_available_features=1,
+                include_entities=1,
+                max_position=self.earliest_tweet_id,
+                reset_error_state=False
+            ),
+            headers={'User-Agent': self.user_agent}
+        )
 
     def __iter__(self):
         super().__iter__()
-        yield from self.client
+        tweets = self.get_user_timeline(self.user)
+        soup = BeautifulSoup(tweets.text, "lxml")
+        self._update_state(soup)
+        yield soup
+
+        while True and self.nb_tweets < self.limit:
+            more_tweets = self.get_more_tweets()
+            html = json.loads(more_tweets.content)
+
+            soup = BeautifulSoup(html['items_html'], "lxml")
+            if not soup.text:
+                break
+
+            self._update_state(soup)
+            yield soup
+
+    def __repr__(self):
+        return "<{} (user={}, limit={})>".format(
+            self.__class__.__qualname__,
+            self.user,
+            self.limit
+        )

+ 16 - 3
twhatter/exploration/strategy/base.py

@@ -1,17 +1,30 @@
 import logging
+from typing import List
 
 logger = logging.getLogger(__name__)
 
 class StrategyBase:
     """Base class for strategies, which define a way to explore Tweeter pages"""
-    def __init__(self, starting_node: 'NodeBase') -> None:
+    starting_node = None
+    parser_classes = []
+
+    def __init__(self, starting_node: 'NodeBase', *parser_classes: 'ParserBase') -> None:
         logger.debug(
-            "Initializing {} with starting_node={}".format(
+            "Initializing {} with starting_node={} and parser_classes={}".format(
                 self.__class__.__qualname__,
-                starting_node
+                starting_node,
+                parser_classes
             )
         )
         self.starting_node = starting_node
+        self.parser_classes = parser_classes
 
     def __call__(self, output) -> None:
         logger.debug("Applying {}".format(self.__class__.__qualname__))
+
+    def __repr__(self):
+        return "<{} (starting_node={}, parsers={})>".format(
+            self.__class__.__qualname__,
+            self.starting_node,
+            self.parser_classes
+        )

+ 13 - 2
twhatter/exploration/strategy/dumb.py

@@ -1,16 +1,27 @@
+import logging
+
 from .base import StrategyBase
 from twhatter.parser import TweetBase, User
 
 
+logger = logging.getLogger(__name__)
+
+
 class StrategyDumb(StrategyBase):
     """This strategy only explores the initial node"""
     def __call__(self, output):
         super().__call__(output)
         output.start()
 
-        tweets = [t for t in self.starting_node if isinstance(t, TweetBase)]
+        objs = []
+        for s in self.starting_node:
+            for parser in self.parser_classes:
+                logger.debug("Parsing new data with {}".format(parser))
+                for o in parser(s):
+                    objs.append(o)
+        tweets = [t for t in objs if isinstance(t, TweetBase)]
         output.output_tweets(tweets)
-        users = [u for u in self.starting_node if isinstance(u, User)]
+        users = [u for u in objs if isinstance(u, User)]
         output.output_users(users)
 
         output.stop()

+ 4 - 0
twhatter/log.py

@@ -33,17 +33,21 @@ def log_setup(verbosity):
         logging.getLogger('twhatter.client').setLevel(logging.DEBUG)
         logging.getLogger('twhatter.parser').setLevel(logging.DEBUG)
         logging.getLogger('twhatter.output').setLevel(logging.DEBUG)
+        logging.getLogger('twhatter.exploration').setLevel(logging.DEBUG)
     elif verbosity == 'debug':
         logging.getLogger('twhatter.client').setLevel(logging.DEBUG)
         logging.getLogger('twhatter.parser').setLevel(logging.INFO)
         logging.getLogger('twhatter.output').setLevel(logging.INFO)
+        logging.getLogger('twhatter.exploration').setLevel(logging.INFO)
     elif verbosity == 'info':
         logging.getLogger('twhatter.client').setLevel(logging.INFO)
         logging.getLogger('twhatter.parser').setLevel(logging.INFO)
         logging.getLogger('twhatter.output').setLevel(logging.INFO)
+        logging.getLogger('twhatter.exploration').setLevel(logging.INFO)
     elif verbosity == 'none':
         logging.getLogger('twhatter.client').setLevel(logging.WARNING)
         logging.getLogger('twhatter.parser').setLevel(logging.WARNING)
         logging.getLogger('twhatter.output').setLevel(logging.WARNING)
+        logging.getLogger('twhatter.exploration').setLevel(logging.WARNING)
 
     logging.config.dictConfig(LOGGING)

+ 0 - 1
twhatter/output/sqlalchemy/db.py

@@ -6,7 +6,6 @@ from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.exc import IntegrityError
 
 from twhatter.output import OutputBase
-from twhatter.client import ClientTimeline, ClientProfile
 
 
 # Registry of SQLAlchemy's models

+ 8 - 5
twhatter/parser/__init__.py

@@ -1,8 +1,8 @@
-from .tweet import (TweetList, TweetBase,
+from .tweet import (TweetList, TweetBase, ParserTweet,
                     tweet_factory,
-                    TweetTextOnly, TweetLink, TweetReaction, TweetRetweet)
-from .user import User, user_factory
-from .media import MediaBase, MediaImage, media_factory
+                    TweetTextOnly, TweetLink, TweetReaction, TweetRetweet,)
+from .user import User, user_factory, ParserUser
+from .media import MediaBase, MediaImage, media_factory, ParserMedia
 
 __all__= [
     "TweetList",
@@ -12,11 +12,14 @@ __all__= [
     "TweetLink",
     "TweetReaction",
     "TweetRetweet",
+    "ParserTweet",
 
     "User",
     "user_factory",
+    "ParserUser",
 
     "MediaBase",
     "MediaImage",
-    "media_factory"
+    "media_factory",
+    "ParserMedia",
 ]

+ 14 - 0
twhatter/parser/base.py

@@ -0,0 +1,14 @@
+from typing import Any
+
+
+class ParserBase:
+    """Base class for a parser, an iterator that yield all elements of a certain
+    type within a given page"""
+    def __init__(self, soup: 'PageElement') -> None:
+        pass
+
+    def __iter__(self) -> Any:
+        raise NotImplementedError()
+
+    def __repr__(self):
+        return "<{} id={}>".format(self.__class__.__qualname__, id(self))

+ 26 - 0
twhatter/parser/media.py

@@ -5,6 +5,7 @@ from dataclasses import dataclass, fields, InitVar, field
 from typing import List, Optional
 
 from .mixins import ExtractableMixin
+from .base import ParserBase
 
 
 logger = logging.getLogger(__name__)
@@ -59,3 +60,28 @@ def media_factory(soup: BeautifulSoup) -> Optional[MediaBase]:
             continue
 
     return None
+
+
+class ParserMedia(ParserBase):
+    def __init__(self, soup):
+        super().__init__(soup)
+        self.soup = soup
+
+    def __iter__(self):
+        kwargs = {
+            f.name: MediaBase._extract_value(self.soup, f) for f in fields(MediaBase)
+        }
+
+        for kls in MediaBase.__subclasses__():
+            try:
+                if kls.condition(kwargs):
+                    m = kls(soup=self.soup, **kwargs)
+                    logger.debug("Parsed media {}".format(m))
+                    return m
+            except NotImplementedError:
+                continue
+
+        return None
+
+    def __len__(self):
+        return 1

+ 7 - 8
twhatter/parser/mixins.py

@@ -23,17 +23,16 @@ class ExtractableMixin:
     @staticmethod
     def _extract_from_div(soup, div_class, data_kw):
         kw = "data-{}".format(data_kw)
-        return(
-            soup.find('div', class_=div_class, attrs={kw: True})[kw]
-        )
+        tag = soup.find('div', class_=div_class, attrs={kw: True})
+
+        return tag[kw] if tag else None
 
     @staticmethod
     def _extract_from_span(soup, distinct_span, data_kw):
-        return (
-            soup.find('span', distinct_span)
-                .find('span', attrs={data_kw: True})
-            [data_kw]
-        )
+        tag = (soup.find('span', distinct_span)
+                   .find('span', attrs={data_kw: True}))
+
+        return tag[data_kw] if tag else None
 
     @staticmethod
     def extract_soup(soup):

+ 19 - 0
twhatter/parser/tweet.py

@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
 from dataclasses import dataclass, fields, InitVar, field
 from typing import List
 
+from .base import ParserBase
 from .mixins import ExtractableMixin
 from .media import MediaBase, media_factory
 
@@ -272,3 +273,21 @@ class TweetList:
 
     def __len__(self):
         return len(self.raw_tweets)
+
+
+class ParserTweet(ParserBase):
+    def __init__(self, soup):
+        super().__init__(soup)
+        self.raw_tweets = soup.find_all('li', 'stream-item')
+
+    def __iter__(self):
+        for tweet in self.raw_tweets:
+            # Don't know what this u-dir stuff is about but if it's in there,
+            # it's not a tweet !
+            if not tweet.find_all('p', class_="u-dir"):
+                t = tweet_factory(tweet)
+                logger.debug("Parsed tweet {}".format(t))
+                yield t
+
+    def __len__(self):
+        return len(self.raw_tweets)

+ 28 - 1
twhatter/parser/user.py

@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
 from dataclasses import dataclass, fields, InitVar
 
 from .mixins import ExtractableMixin
+from .base import ParserBase
 
 
 logger = logging.getLogger(__name__)
@@ -43,7 +44,11 @@ class User(ExtractableMixin):
 
     @classmethod
     def extract_id(cls, soup):
-        return int(cls._extract_from_div(soup, 'ProfileNav', 'user-id'))
+        id_str = cls._extract_from_div(soup, 'ProfileNav', 'user-id')
+        if not id_str:
+            raise ValueError("No id could be found")
+
+        return int(id_str)
 
     @classmethod
     def extract_fullname(cls, soup):
@@ -91,3 +96,25 @@ def user_factory(soup: BeautifulSoup) -> User:
     u = User(**kwargs)
     logger.debug("Parsed user {}".format(u))
     return u
+
+
+class ParserUser(ParserBase):
+    def __init__(self, soup):
+        super().__init__(soup)
+        self.soup = soup
+
+    def __iter__(self):
+        try:
+            kwargs = {
+                f.name: User._extract_value(self.soup, f) for f in fields(User)
+            }
+        except ValueError:
+            logger.debug("Soup contained no data for {}".format(self))
+            return
+
+        u = User(**kwargs)
+        logger.debug("Parsed user {}".format(u))
+        yield u
+
+    def __len__(self):
+        return 1