theenglishway (time) 6 років тому
батько
коміт
0401db6db7

Різницю між файлами не показано, бо вона завелика
+ 2605 - 0
tweeets.yaml


+ 3 - 2
twhatter/cli.py

@@ -8,7 +8,7 @@ import IPython
 from twhatter.output import Print, Json, Database, Yaml
 from twhatter.output.sqlalchemy import Tweet, User
 from twhatter.log import log_setup
-from twhatter.exploration import StrategyDumb, NodeTimeline
+from twhatter.exploration import StrategyDumb, NodeTimeline, NodeProfile, Session
 from twhatter.parser import ParserTweet, ParserUser
 
 
@@ -63,7 +63,8 @@ def timeline(ctx, limit, user):
 @click.pass_context
 def profile(ctx, user):
     """Get a user's profile information"""
-    start_node = NodeTimeline(user, limit=1)
+    #session = Session("theenglishway", "")
+    start_node = NodeProfile(user, limit=30)
     strategy = StrategyDumb(start_node, ParserUser)
     strategy(ctx.obj['output'])
 

+ 1 - 0
twhatter/exploration/__init__.py

@@ -1,2 +1,3 @@
 from .node import *
 from .strategy import *
+from .session import Session

+ 4 - 0
twhatter/exploration/node/__init__.py

@@ -1,5 +1,9 @@
 from .timeline import NodeTimeline
+from .profile import NodeProfile
+from .login import NodeLogin
 
 __all__ = [
     'NodeTimeline',
+    'NodeProfile',
+    'NodeLogin'
 ]

+ 2 - 1
twhatter/exploration/node/base.py

@@ -7,8 +7,9 @@ class NodeBase:
     to visit with a simple HTTP client.
     They behave as generators, and yield `PageElement` data as processed
     by the `BeautifulSoup` library."""
-    def __init__(self):
+    def __init__(self, session=None):
         logger.debug("Initializing {}".format(self.__class__.__qualname__))
+        self._session = session
 
     def __iter__(self) -> 'PageElement':
         logger.debug("Iterating on {}".format(self.__class__.__qualname__))

+ 61 - 0
twhatter/exploration/node/login.py

@@ -0,0 +1,61 @@
+import logging
+import json
+
+import requests
+from bs4 import BeautifulSoup
+from user_agent import generate_user_agent
+
+from .base import NodeBase
+from twhatter.parser import ParserTweet
+
+logger = logging.getLogger(__name__)
+
+
+class NodeLogin(NodeBase):
+    """Implementation of the "login" node, which is the page accessed by
+    https://twitter.com/login.
+
+    Iterating on the node requests that page once and yields its parsed
+    content; no scrolling is attempted."""
+    user_agent = generate_user_agent(os='linux')
+
+    def __init__(self, user, password):
+        """Store the credentials; the request is only made on iteration."""
+        super().__init__()
+        self.user = user
+        self.password = password
+
+    @classmethod
+    def _get_base_page(cls, user_handle):
+        """Request the login page and return the HTTP response.
+
+        `user_handle` only appears in the log message: the URL does not
+        depend on it."""
+        logger.info("Loading login page for {}".format(user_handle))
+        # NOTE(review): the page is requested with POST but no form data --
+        # confirm whether GET was intended.
+        url = "https://twitter.com/login"
+        return requests.post(
+            url,
+            headers={
+                'User-Agent': cls.user_agent,
+                'Accept-Language': 'en'
+            }
+        )
+
+    def __iter__(self):
+        """Yield the login page as a single `BeautifulSoup` document."""
+        super().__iter__()
+        base = self._get_base_page(self.user)
+        # Neither `limit`, `nb_tweets`, `_update_state` nor `_scroll` is
+        # defined on this class, so the page is yielded as-is instead of
+        # entering a scrolling loop.
+        soup = BeautifulSoup(base.text, "lxml")
+        yield soup
+
+    def __repr__(self):
+        """Return a debug representation; the password is left out."""
+        return "<{} (user={})>".format(
+            self.__class__.__qualname__,
+            self.user
+        )

+ 106 - 0
twhatter/exploration/node/profile.py

@@ -0,0 +1,106 @@
+import logging
+import json
+
+import requests
+from bs4 import BeautifulSoup
+from user_agent import generate_user_agent
+
+from .base import NodeBase
+from twhatter.parser import ParserTweet
+
+logger = logging.getLogger(__name__)
+
+
+class NodeProfile(NodeBase):
+    """Implementation of the "profile" node, which is the page accessed by
+    https://twitter.com/the_user_name, that can be navigated into.
+
+    Iterating on the node yields the base page, then one document per
+    scrolling request, until `limit` tweets have been counted or a request
+    brings nothing new."""
+    user_agent = generate_user_agent(os='linux')
+
+    def __init__(self, user, limit=100):
+        """Prepare the exploration of the profile of `user`.
+
+        `limit` is the number of tweets from which scrolling stops."""
+        super().__init__()
+        self.user = user
+        self.earliest_tweet_id = None
+        self.nb_tweets = 0
+        self.limit = limit
+
+    def _update_state(self, soup):
+        """Count the tweets found in `soup` and remember the id of the last
+        one, which the next scrolling request uses as its position."""
+        tweets = ParserTweet(soup)
+        self.nb_tweets += len(tweets)
+        # A document without any tweet leaves the position untouched;
+        # unpacking the last id directly would raise on an empty sequence.
+        for tweet in tweets:
+            self.earliest_tweet_id = tweet.id
+        logger.info("{} tweets retrieved so far".format(self.nb_tweets))
+
+    @classmethod
+    def _get_base_page(cls, user_handle):
+        """Request the profile page of `user_handle`."""
+        logger.info("Loading base page for {}'s timeline".format(user_handle))
+        url = "https://twitter.com/{}".format(user_handle)
+        return requests.get(
+            url,
+            headers={
+                'User-Agent': cls.user_agent,
+                'Accept-Language': 'en'
+            }
+        )
+
+    def _scroll(self):
+        """Request the content located after `earliest_tweet_id`."""
+        logger.info("Scrolling in {}'s timeline".format(self.user))
+        # NOTE(review): the request targets the followers endpoint although
+        # the position is a tweet id -- confirm this is the intended URL.
+        return requests.get(
+            "https://twitter.com/{}/followers/users".format(self.user),
+            params=dict(
+                include_available_features=1,
+                include_entities=1,
+                max_position=self.earliest_tweet_id,
+                reset_error_state=False
+            ),
+            headers={'User-Agent': self.user_agent}
+        )
+
+    def __iter__(self):
+        """Yield the base page, then the content of each scrolling request."""
+        super().__iter__()
+        base = self._get_base_page(self.user)
+        soup = BeautifulSoup(base.text, "lxml")
+        self._update_state(soup)
+        yield soup
+
+        while self.nb_tweets < self.limit:
+            previous_count = self.nb_tweets
+            more = self._scroll()
+            html = json.loads(more.content)
+
+            soup = BeautifulSoup(html['items_html'], "lxml")
+            if not soup.text:
+                logger.info("Latest request provided no explorable content")
+                break
+
+            self._update_state(soup)
+            yield soup
+
+            if self.nb_tweets == previous_count:
+                # Without new tweets the position has not moved, so the next
+                # request would be the same one again: stop here.
+                logger.info("Latest request provided no new tweet")
+                break
+
+    def __repr__(self):
+        """Return a debug representation with the user and the limit."""
+        return "<{} (user={}, limit={})>".format(
+            self.__class__.__qualname__,
+            self.user,
+            self.limit
+        )

+ 67 - 0
twhatter/exploration/session.py

@@ -0,0 +1,67 @@
+import logging
+import requests
+from bs4 import BeautifulSoup
+from user_agent import generate_user_agent
+
+from .node import NodeLogin
+
+logger = logging.getLogger(__name__)
+
+
+class Session:
+    """Attempt to log into Twitter through the form of its login page.
+
+    Creating an instance loads the login page, reads the
+    `authenticity_token` input from it and posts it back along with the
+    credentials. The answer to that request is kept in `response`."""
+    user_agent = generate_user_agent(os='linux')
+    url = "https://twitter.com/login"
+
+    def __init__(self, user, password):
+        """Run the login sequence for `user`.
+
+        Raises `ValueError` if the login page holds no authenticity token."""
+        super().__init__()
+        self.user = user
+        self.password = password
+        self.login_page = self._get_page()
+        self.soup = BeautifulSoup(self.login_page.text, "lxml")
+        token_input = self.soup.find(
+            "input", attrs={'name': "authenticity_token"}
+        )
+        if token_input is None:
+            raise ValueError("No authenticity token found in the login page")
+        self.data = {
+            'session[username_or_email]': self.user,
+            'session[password]': self.password,
+            'authenticity_token': token_input.attrs['value']
+        }
+        self.response = self._post_page()
+        # Report the outcome through the module logger rather than on stdout.
+        logger.debug("Login request answered with {}".format(self.response))
+
+    def _get_page(self):
+        """Request the login page and return the HTTP response."""
+        logger.info("Loading login page for {}".format(self.user))
+        return requests.get(
+            self.url,
+            headers={
+                'User-Agent': self.user_agent,
+                'Accept-Language': 'en'
+            }
+        )
+
+    def _post_page(self):
+        """Post the login form (`self.data`) and return the HTTP response."""
+        logger.info("Posting login form for {}".format(self.user))
+        return requests.post(
+            self.url,
+            data=self.data,
+            # NOTE(review): the cookies set by the login page are sent back,
+            # assuming the token is only valid together with them -- confirm.
+            cookies=self.login_page.cookies,
+            headers={
+                'User-Agent': self.user_agent,
+                'Accept-Language': 'en'
+            }
+        )