theenglishway committed 7 years ago
Parent
Commit
ae3874ced7
5 changed files with 64 additions and 288 deletions
  1. twhatter/__main__.py (+6 −6)
  2. twhatter/api.py (+50 −18)
  3. twhatter/cli.py (+5 −8)
  4. twhatter/old_query.py (+0 −256)
  5. twhatter/parser/tweet.py (+3 −0)

+ 6 - 6
twhatter/__main__.py

@@ -1,13 +1,13 @@
-from twhatter.old_query import query_tweets_from_user
-from twhatter.query import Query
 from twhatter.api import ApiUser
 from bs4 import BeautifulSoup
 from twhatter.parser import TweetList
 from twhatter.output import Print

+user="the_english_way"
+a = ApiUser(user)

-q = Query(ApiUser("the_english_way").init_page)
-soup = BeautifulSoup(q.text, "lxml")
-t_list = TweetList(soup)
-for t in t_list:
+for t in a.iter_own_tweets():
+    Print(t)()
+
+for t in a.iter_all_tweets():
     Print(t)()
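Note: iter_own_tweets() and iter_all_tweets() are not defined anywhere in this commit; the only generator added in twhatter/api.py below is iter_tweets(). A minimal runnable sketch against what this diff actually lands, assuming the intended call was the one the API defines:

    from twhatter.api import ApiUser
    from twhatter.output import Print

    a = ApiUser("the_english_way")
    # iter_tweets() is the only iterator twhatter/api.py defines here
    for t in a.iter_tweets():
        Print(t)()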

+ 50 - 18
twhatter/api.py

@@ -1,12 +1,12 @@
 import requests
 from random import choice
+from bs4 import BeautifulSoup

-
-class Api():
-    pass
+from twhatter.parser import TweetList
+import json


-class ApiUser(Api):
+class Api():
     HEADERS_LIST = [
         'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
         'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
@@ -15,24 +15,56 @@ class ApiUser(Api):
         'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre'
     ]

-    def __init__(self, user):
+    def get_initial(self):
+        raise NotImplementedError()
+
+    def get_more_tweets(self):
+        raise NotImplementedError()
+
+
+class ApiUser(Api):
+    def __init__(self, user, limit=100):
         self.user = user
+        self.earliest_tweet = None
+        self.limit = limit

-    @property
-    def init_page(self):
+    def get_initial(self):
+        url = "https://twitter.com/{}".format(self.user)
         return requests.get(
-            'https://twitter.com/{}'.format(self.user),
+            url,
             headers={'User-Agent': choice(self.HEADERS_LIST)}
         )

-    @property
-    def tweets_from(self, position):
-        return (
-            "https://twitter.com/i/profiles/show/{u}"
-            "/timeline"
-            "/tweets"
-            "?include_available_features=1"
-            "&include_entities=1"
-            "&max_position={pos}"
-            "&reset_error_state=false"
+    def get_more_tweets(self):
+        return requests.get(
+            "https://twitter.com/i/profiles/show/{}/timeline/tweets".format(self.user),
+            params= dict(
+                include_available_features=1,
+                include_entities=1,
+                max_position=self.earliest_tweet,
+                reset_error_state=False
+            ),
+            headers={'User-Agent': choice(self.HEADERS_LIST)}
         )
+
+    def iter_tweets(self):
+        tweets = self.get_initial()
+        soup = BeautifulSoup(tweets.text, "lxml")
+        t_list = TweetList(soup)
+
+        for t in t_list:
+            yield t
+            self.earliest_tweet = t.id
+
+        while True:
+            more_tweets = self.get_more_tweets()
+            html = json.loads(more_tweets.content)
+            soup = BeautifulSoup(html['items_html'], "lxml")
+            t_list = TweetList(soup)
+
+            if len(t_list) == 0:
+                break
+
+            for t in t_list:
+                yield t
+                self.earliest_tweet = t.id
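The new iter_tweets() generator drives pagination itself: each yielded tweet's id is stored in self.earliest_tweet, which the next get_more_tweets() call sends as the max_position parameter, and iteration stops once a page parses to an empty TweetList. The limit accepted by ApiUser.__init__() is stored but never checked inside the loop, so the caller has to cap iteration; a minimal sketch, assuming itertools.islice as the cap (not part of this commit):

    from itertools import islice
    from twhatter.api import ApiUser

    api = ApiUser("the_english_way", limit=100)

    # iter_tweets() fetches timeline pages lazily; islice enforces the
    # cap that the generator itself does not apply to api.limit.
    for tweet in islice(api.iter_tweets(), api.limit):
        print(tweet.id)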

+ 5 - 8
twhatter/cli.py

@@ -4,21 +4,18 @@
 import click

 from twhatter.api import ApiUser
-from bs4 import BeautifulSoup
-from twhatter.parser import TweetList


 @click.command()
 @click.option('--user', prompt='User name to check',
               help='The person to greet.')
-def main(user):
+@click.option('-r', '--replies', is_flag=True)
+def main(user, replies):
     """Console script for twhatter."""
     """Console script for twhatter."""
-    p = ApiUser(user).init_page
-    soup = BeautifulSoup(p.text, "lxml")
-    t_list = TweetList(soup)
-    for t in t_list:
-        click.echo(t)
+    a = ApiUser(user)

+    for t in a.iter_tweets():
+        click.echo(t)

 if __name__ == "__main__":
     main()
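The new -r/--replies flag is parsed but never read inside main() in this commit, and the committed help text ('The person to greet.') is a leftover from click's greeter template. A hypothetical sketch of wiring the flag through (the replies keyword on iter_tweets() does not exist in this diff; it is shown only to illustrate the likely follow-up):

    @click.command()
    @click.option('--user', prompt='User name to check',
                  help='The user whose timeline to fetch.')
    @click.option('-r', '--replies', is_flag=True)
    def main(user, replies):
        a = ApiUser(user)
        # hypothetical: a replies switch that iter_tweets() does not take yet
        for t in a.iter_tweets(replies=replies):
            click.echo(t)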

+ 0 - 256
twhatter/old_query.py

@@ -1,256 +0,0 @@
-from __future__ import division
-import random
-import requests
-import datetime as dt
-import json
-from functools import partial
-from multiprocessing.pool import Pool
-
-from twhatter.parser.tweet import Tweet
-import urllib
-
-import logging
-
-
-logger = logging.getLogger('twitterscraper')
-
-formatter = logging.Formatter('%(levelname)s: %(message)s')
-handler = logging.StreamHandler()
-handler.setFormatter(formatter)
-logger.addHandler(handler)
-
-level = logging.INFO
-logger.setLevel(level)
-
-
-HEADERS_LIST = [
-    'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
-    'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
-    'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
-    'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
-    'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre'
-]
-
-HEADER = {'User-Agent': random.choice(HEADERS_LIST)}
-
-INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q={q}&l={lang}'
-RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' \
-             'default&include_available_features=1&include_entities=1&' \
-             'reset_error_state=false&src=typd&max_position={pos}&q={q}&l={lang}'
-INIT_URL_USER = 'https://twitter.com/{u}'
-RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/{u}/timeline/tweets?' \
-                  'include_available_features=1&include_entities=1&' \
-                  'max_position={pos}&reset_error_state=false'
-
-
-def get_query_url(query, lang, pos, from_user = False):
-    if from_user:
-        if pos is None:
-            return INIT_URL_USER.format(u=query)
-        else:
-            return RELOAD_URL_USER.format(u=query, pos=pos)
-    if pos is None:
-        return INIT_URL.format(q=query, lang=lang)
-    else:
-        return RELOAD_URL.format(q=query, pos=pos, lang=lang)
-
-
-def linspace(start, stop, n):
-    if n == 1:
-        yield stop
-        return
-    h = (stop - start) / (n - 1)
-    for i in range(n):
-        yield start + h * i
-
-
-def query_single_page(query, lang, pos, retry=50, from_user=False):
-    """
-    Returns tweets from the given URL.
-
-    :param query: The query parameter of the query url
-    :param lang: The language parameter of the query url
-    :param pos: The query url parameter that determines where to start looking
-    :param retry: Number of retries if something goes wrong.
-    :return: The list of tweets, the pos argument for getting the next page.
-    """
-    url = get_query_url(query, lang, pos, from_user)
-
-    try:
-        response = requests.get(url, headers=HEADER)
-        if pos is None:  # html response
-            html = response.text or ''
-            json_resp = None
-        else:
-            html = ''
-            try:
-                json_resp = json.loads(response.text)
-                html = json_resp['items_html'] or ''
-            except ValueError as e:
-                logger.exception('Failed to parse JSON "{}" while requesting "{}"'.format(e, url))
-
-        tweets = list(Tweet.from_html(html))
-
-        if not tweets:
-            if json_resp:
-                pos = json_resp['min_position']
-            else:
-                pos = None
-            if retry > 0:
-                return query_single_page(query, lang, pos, retry - 1, from_user)
-            else:
-                return [], pos
-
-        if json_resp:
-            return tweets, urllib.parse.quote(json_resp['min_position'])
-        if from_user:
-            return tweets, tweets[-1].id
-        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
-
-    except requests.exceptions.HTTPError as e:
-        logger.exception('HTTPError {} while requesting "{}"'.format(
-            e, url))
-    except requests.exceptions.ConnectionError as e:
-        logger.exception('ConnectionError {} while requesting "{}"'.format(
-            e, url))
-    except requests.exceptions.Timeout as e:
-        logger.exception('TimeOut {} while requesting "{}"'.format(
-            e, url))
-    except json.decoder.JSONDecodeError as e:
-        logger.exception('Failed to parse JSON "{}" while requesting "{}".'.format(
-            e, url))
-
-    if retry > 0:
-        logger.info('Retrying... (Attempts left: {})'.format(retry))
-        return query_single_page(query, lang, pos, retry - 1)
-
-    logger.error('Giving up.')
-    return [], None
-
-
-def query_tweets_once_generator(query, limit=None, lang='', pos=None):
-    """
-    Queries twitter for all the tweets you want! It will load all pages it gets
-    from twitter. However, twitter might out of a sudden stop serving new pages,
-    in that case, use the `query_tweets` method.
-
-    Note that this function catches the KeyboardInterrupt so it can return
-    tweets on incomplete queries if the user decides to abort.
-
-    :param query: Any advanced query you want to do! Compile it at
-                  https://twitter.com/search-advanced and just copy the query!
-    :param limit: Scraping will be stopped when at least ``limit`` number of
-                  items are fetched.
-    :param pos: Field used as a "checkpoint" to continue where you left off in iteration
-    :return:      A list of twitterscraper.Tweet objects. You will get at least
-                  ``limit`` number of items.
-    """
-    logger.info('Querying {}'.format(query))
-    query = query.replace(' ', '%20').replace('#', '%23').replace(':', '%3A')
-    num_tweets = 0
-    try:
-        while True:
-            new_tweets, new_pos = query_single_page(query, lang, pos)
-            if len(new_tweets) == 0:
-                logger.info('Got {} tweets for {}.'.format(
-                    num_tweets, query))
-                return
-
-            for t in new_tweets:
-                yield t, pos
-
-            # use new_pos only once you have iterated through all old tweets
-            pos = new_pos
-
-            num_tweets += len(new_tweets)
-
-            if limit and num_tweets >= limit:
-                logger.info('Got {} tweets for {}.'.format(
-                    num_tweets, query))
-                return
-
-    except KeyboardInterrupt:
-        logger.info('Program interrupted by user. Returning tweets gathered '
-                     'so far...')
-    except BaseException:
-        logger.exception('An unknown error occurred! Returning tweets '
-                          'gathered so far.')
-    logger.info('Got {} tweets for {}.'.format(
-        num_tweets, query))
-
-
-def query_tweets_once(*args, **kwargs):
-    res = list(query_tweets_once_generator(*args, **kwargs))
-    if res:
-        tweets, positions = zip(*res)
-        return tweets
-    else:
-        return []
-
-
-def query_tweets(query, limit=None, begindate=dt.date(2006, 3, 21), enddate=dt.date.today(), poolsize=20, lang=''):
-    no_days = (enddate - begindate).days
-
-    if(no_days < 0):
-        sys.exit('Begin date must occur before end date.')
-
-    if poolsize > no_days:
-        # Since we are assigning each pool a range of dates to query,
-		# the number of pools should not exceed the number of dates.
-        poolsize = no_days
-    dateranges = [begindate + dt.timedelta(days=elem) for elem in linspace(0, no_days, poolsize+1)]
-
-    if limit:
-        limit_per_pool = (limit // poolsize)+1
-    else:
-        limit_per_pool = None
-
-    queries = ['{} since:{} until:{}'.format(query, since, until)
-               for since, until in zip(dateranges[:-1], dateranges[1:])]
-
-    all_tweets = []
-    try:
-        pool = Pool(poolsize)
-        logger.info('queries: {}'.format(queries))
-        try:
-            for new_tweets in pool.imap_unordered(partial(query_tweets_once, limit=limit_per_pool, lang=lang), queries):
-                all_tweets.extend(new_tweets)
-                logger.info('Got {} tweets ({} new).'.format(
-                    len(all_tweets), len(new_tweets)))
-        except KeyboardInterrupt:
-            logger.info('Program interrupted by user. Returning all tweets '
-                         'gathered so far.')
-    finally:
-        pool.close()
-        pool.join()
-
-    return all_tweets
-
-
-def query_tweets_from_user(user, limit=None):
-    pos = None
-    tweets = []
-    query = user
-    try:
-        while True:
-           new_tweets, pos = query_single_page(query, lang='', pos=pos, from_user=True)
-           if len(new_tweets) == 0:
-               logger.info("Got {} tweets from username {}".format(len(tweets), user))
-               return tweets
-
-           tweets += new_tweets
-
-           if limit and len(tweets) >= limit:
-               logger.info("Got {} tweets from username {}".format(len(tweets), user))
-               return tweets
-
-    except KeyboardInterrupt:
-        logger.info("Program interrupted by user. Returning tweets gathered "
-                     "so far...")
-    except BaseException:
-        logger.exception("An unknown error occurred! Returning tweets "
-                          "gathered so far.")
-    logger.info("Got {} tweets from username {}.".format(
-        len(tweets), user))
-    return tweets
-

+ 3 - 0
twhatter/parser/tweet.py

@@ -105,3 +105,6 @@ class TweetList:
     def __iter__(self):
         for tweet in self.raw_tweets:
             yield Tweet.extract(tweet)
+
+    def __len__(self):
+        return len(self.raw_tweets)
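With __len__ in place, TweetList supports the empty-page check (len(t_list) == 0) that iter_tweets() relies on above. A small standalone sketch, mirroring how api.py builds the list; "profile.html" is a placeholder for any saved timeline page:

    from bs4 import BeautifulSoup
    from twhatter.parser import TweetList

    with open("profile.html") as f:   # assumption: a saved Twitter profile page
        soup = BeautifulSoup(f.read(), "lxml")

    t_list = TweetList(soup)
    if len(t_list) == 0:              # enabled by the new __len__
        print("no tweets on this page")
    else:
        for tweet in t_list:          # __iter__ yields parsed Tweet objects
            print(tweet.id)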