# I used the TwitterAPI Python package as a convenient wrapper for the API requests
from TwitterAPI import TwitterAPI
import pandas as pd
# Show every column when displaying DataFrames. The fully qualified option name
# is used because the bare pattern 'max_columns' can match more than one option
# in newer pandas releases, which makes set_option raise an OptionError.
pd.set_option('display.max_columns', None)
import time
import json
import matplotlib.pyplot as plt
import numpy as np
import dateutil
import datetime
import matplotlib.dates as mdates
import seaborn as sns
# Set up the Twitter API client with OAuth keys. These should be set elsewhere and kept secret!
api = TwitterAPI(consumer_key, consumer_secret, access_token_key, access_token_secret)

# Both requests go to the same counts endpoint with the same geo-restricted query.
counts_endpoint = "tweets/search/30day/:dev/counts"
geo_query = "(urgent OR help) point_radius:[77.232410 28.637459 40km]"

# First request: no "bucket" parameter is sent.
resp_hourly = api.request(counts_endpoint, {"query": geo_query})
hourly_results = resp_hourly.json()['results']
dfa = pd.DataFrame(hourly_results)
dfa['timePeriod'] = dfa['timePeriod'].apply(pd.to_datetime)

# Second request: identical query, with "bucket" set to "day".
resp_daily = api.request(counts_endpoint, {"query": geo_query, "bucket": "day"})
daily_results = resp_daily.json()['results']
dfa2 = pd.DataFrame(daily_results)
dfa2['timePeriod'] = dfa2['timePeriod'].apply(pd.to_datetime)
When grouped by day, the hourly data closely matches the daily data. However, especially on days with low tweet volume, the aggregated hourly data appears to slightly overestimate the daily count. This is explained by the quantization of low-value counts, seen below.
# Re-aggregate the hourly counts into one-day bins so they can be compared
# against the counts that were requested per day.
daily_bins = pd.Grouper(key='timePeriod', freq='1D')
hourly_summed_by_day = dfa.groupby(daily_bins).sum()['count']

# Three lines: daily counts, raw hourly counts, hourly counts summed per day.
plt.plot(dfa2['timePeriod'], dfa2['count'])
plt.plot(dfa['timePeriod'], dfa['count'])
plt.plot(hourly_summed_by_day)
[<matplotlib.lines.Line2D at 0x133e10490>]
The counts endpoint appears to quantize/obfuscate any counts between 1 and 5 to a value of 5. Interestingly, this only appears to be the case for georeferenced queries. (Shown further below in this notebook.)
# Plot the hourly counts on their own.
hourly_times = dfa['timePeriod']
hourly_counts = dfa['count']
plt.plot(hourly_times, hourly_counts)
[<matplotlib.lines.Line2D at 0x133e80a30>]
import asyncio
import aiohttp
import nest_asyncio
nest_asyncio.apply()
import twint
# TWINT search query with geocode. It mirrors the query used to test the
# Premium Search API above; only the syntax differs slightly.
twintq = '(urgent OR help) AND geocode:28.637459,77.232410,40km'

c = twint.Config()

# what to search for, and how far back
c.Search = twintq
c.Since = "2021-04-04"

# store results as CSV at the given path and hide TWINT's console output
c.Output = "/Users/loganw/test.csv"
c.Store_csv = True
c.Hide_output = True

twint.run.Search(c)
[!] No more data! Scraping will stop now. found 0 deleted tweets in this search.
# read TWINT CSV as a Pandas DataFrame
# low_memory=False makes pandas infer each column's dtype from the whole file,
# which avoids the mixed-type DtypeWarning this read otherwise emits.
dft = pd.read_csv('/Users/loganw/test.csv', parse_dates=['created_at'], low_memory=False)
/Users/loganw/.local/share/virtualenvs/venv-numba-UjqtKfx8/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3165: DtypeWarning: Columns (9) have mixed types.Specify dtype option on import or set low_memory=False. has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
# Bucket the TWINT tweets into one-hour bins on their creation time.
hourly_bins = pd.Grouper(key='created_at', freq='1H')
grp = dft.groupby(hourly_bins)

# Count the 'id' values in each bin to get the hourly tweet volume.
tweets_per_hour = grp['id'].count()

plt.figure(figsize=(12, 4))
plt.plot(tweets_per_hour)
plt.ylabel('Tweets per hour')
Text(0, 0.5, 'Tweets per hour')
# Overlay the Premium Search API hourly counts and the TWINT hourly tweet counts.
twint_hourly_counts = grp.count()['id']

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(dfa['timePeriod'], dfa['count'])
ax.plot(twint_hourly_counts)
ax.legend(['Premium Search API counts', 'TWINT tweets'])
<matplotlib.legend.Legend at 0x1341e6fa0>
TWINT appears to return many more tweets for the most recent seven days than are counted by the Premium Search API. Beyond this period, however, the two are very similar, except for a few cases where the Premium Search API counts endpoint reports more tweets, which could be a consequence of deleted tweets.
# Same overlay as before, with the y-axis limited to 0-30 so that the
# low-count region can be inspected.
twint_hourly_counts = grp.count()['id']

fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(dfa['timePeriod'], dfa['count'])
ax.plot(twint_hourly_counts)
ax.set_ylim([0, 30])
ax.legend(['Premium Search API counts', 'TWINT tweets'])
<matplotlib.legend.Legend at 0x14036ec40>
On the lower left of the graph, the quantization of low value counts is very visible.
Use the search endpoint instead of the counts endpoint to get actual tweets.
# Request tweets (not counts) from the 30-day search endpoint, asking for up
# to 500 results with the same geo-restricted query string.
search_params = {
    "query": "(urgent OR help) point_radius:[77.232410 28.637459 40km]",
    "maxResults": 500,
}
resp_tweets = api.request("tweets/search/30day/:dev", search_params)

# One row per returned tweet; parse each 'created_at' value into a timestamp.
api_tweets = pd.DataFrame(resp_tweets.json()['results'])
api_tweets['created_at'] = api_tweets['created_at'].apply(pd.to_datetime)
# Fresh TWINT configuration reusing the geocode query defined earlier.
c = twint.Config()

# what to search for, and how far back
c.Search = twintq
c.Since = "2021-05-03"

# store results as CSV at the given path and hide TWINT's console output
c.Output = "/Users/loganw/twint_tweets_loc.csv"
c.Store_csv = True
c.Hide_output = True

twint.run.Search(c)
[!] No more data! Scraping will stop now. found 0 deleted tweets in this search.
# Load the TWINT results, parsing 'created_at' as timestamps.
# low_memory=False makes pandas infer each column's dtype from the whole file,
# which avoids the mixed-type DtypeWarning this read otherwise emits.
# NOTE(review): the TWINT search in this notebook writes to
# '/Users/loganw/twint_tweets_loc.csv', but this reads 'twint_tweets.csv' --
# confirm which file is intended.
twint_tweets = pd.read_csv('/Users/loganw/twint_tweets.csv', parse_dates=['created_at'], low_memory=False)
/Users/loganw/.local/share/virtualenvs/venv-numba-UjqtKfx8/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3165: DtypeWarning: Columns (9) have mixed types.Specify dtype option on import or set low_memory=False. has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
# Sanity check: plot the creation times of both datasets against their row
# index to confirm that they cover overlapping time ranges.
twint_times = twint_tweets['created_at']
api_times = api_tweets['created_at']
plt.plot(twint_times)
plt.plot(api_times)
[<matplotlib.lines.Line2D at 0x13a96e400>]
Most tweets do not have explicit geo information.
# Keep only the tweets whose 'geo' field is populated. notna() treats both
# None and NaN as missing, so rows where the value is absent in either form
# are excluded.
api_tweets[api_tweets['geo'].notna()]
created_at | id | id_str | text | display_text_range | source | truncated | in_reply_to_status_id | in_reply_to_status_id_str | in_reply_to_user_id | in_reply_to_user_id_str | in_reply_to_screen_name | user | geo | coordinates | place | contributors | quoted_status_id | quoted_status_id_str | quoted_status | quoted_status_permalink | is_quote_status | extended_tweet | quote_count | reply_count | retweet_count | favorite_count | entities | favorited | retweeted | possibly_sensitive | filter_level | lang | matching_rules | extended_entities | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
129 | 2021-05-04 08:34:58+00:00 | 1389498775664791554 | 1389498775664791554 | #help #getvaccienated #registration #staysafe ... | NaN | <a href="http://instagram.com" rel="nofollow">... | True | NaN | None | NaN | None | None | {'id': 1085520064017772545, 'id_str': '1085520... | {'type': 'Point', 'coordinates': [28.6410658, ... | {'type': 'Point', 'coordinates': [77.08275231,... | {'id': '317fcc4b21a604d5', 'url': 'https://api... | None | NaN | NaN | NaN | NaN | False | {'full_text': '#help #getvaccienated #registra... | 0 | 0 | 0 | 0 | {'hashtags': [{'text': 'help', 'indices': [0, ... | False | False | False | low | no | [{'tag': None}] | NaN |
423 | 2021-05-03 18:37:15+00:00 | 1389287959271198729 | 1389287959271198729 | Need urgent assistance @ Delhi, India https://... | NaN | <a href="http://instagram.com" rel="nofollow">... | False | NaN | None | NaN | None | None | {'id': 936113527, 'id_str': '936113527', 'name... | {'type': 'Point', 'coordinates': [28.631747, 7... | {'type': 'Point', 'coordinates': [77.219672, 2... | {'id': '317fcc4b21a604d5', 'url': 'https://api... | None | NaN | NaN | NaN | NaN | False | NaN | 0 | 0 | 0 | 0 | {'hashtags': [], 'urls': [{'url': 'https://t.c... | False | False | False | low | en | [{'tag': None}] | NaN |
However, all seem to contain "place" information.
# Keep only the tweets whose 'place' field is populated. notna() treats both
# None and NaN as missing, so rows where the value is absent in either form
# are excluded.
api_tweets[api_tweets['place'].notna()]
created_at | id | id_str | text | display_text_range | source | truncated | in_reply_to_status_id | in_reply_to_status_id_str | in_reply_to_user_id | in_reply_to_user_id_str | in_reply_to_screen_name | user | geo | coordinates | place | contributors | quoted_status_id | quoted_status_id_str | quoted_status | quoted_status_permalink | is_quote_status | extended_tweet | quote_count | reply_count | retweet_count | favorite_count | entities | favorited | retweeted | possibly_sensitive | filter_level | lang | matching_rules | extended_entities | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2021-05-04 12:23:18+00:00 | 1389556236232761345 | 1389556236232761345 | Not required any help at WB they all are crim... | [0, 140] | <a href="http://twitter.com/download/android" ... | True | NaN | None | NaN | None | None | {'id': 1251881467505856517, 'id_str': '1251881... | None | None | {'id': '4d51b7ea67b6f64b', 'url': 'https://api... | None | 1.388856e+18 | 1388855746448695299 | {'created_at': 'Sun May 02 13:59:48 +0000 2021... | {'url': 'https://t.co/ZQMyhFkrz0', 'expanded':... | True | {'full_text': 'Not required any help at WB th... | 0 | 0 | 0 | 0 | {'hashtags': [], 'urls': [{'url': 'https://t.c... | False | False | False | low | en | [{'tag': None}] | NaN |
1 | 2021-05-04 12:22:49+00:00 | 1389556116862816261 | 1389556116862816261 | Who ever can please help!!!\n\n#covid #COVIDEm... | NaN | <a href="http://twitter.com/download/iphone" r... | False | 1.389555e+18 | 1389555145772462080 | 113897359.0 | 113897359 | the__himanish | {'id': 113897359, 'id_str': '113897359', 'name... | None | None | {'id': '317fcc4b21a604d5', 'url': 'https://api... | None | NaN | NaN | NaN | NaN | False | NaN | 0 | 0 | 0 | 0 | {'hashtags': [{'text': 'covid', 'indices': [29... | False | False | NaN | low | en | [{'tag': None}] | NaN |
2 | 2021-05-04 12:18:58+00:00 | 1389555145772462080 | 1389555145772462080 | Trying to help in any way I can.\n\n#covid #CO... | [0, 73] | <a href="http://twitter.com/download/iphone" r... | False | NaN | None | NaN | None | None | {'id': 113897359, 'id_str': '113897359', 'name... | None | None | {'id': '317fcc4b21a604d5', 'url': 'https://api... | None | NaN | NaN | NaN | NaN | False | NaN | 0 | 1 | 0 | 0 | {'hashtags': [{'text': 'covid', 'indices': [34... | False | False | False | low | en | [{'tag': None}] | {'media': [{'id': 1389555138679820292, 'id_str... |
3 | 2021-05-04 12:17:45+00:00 | 1389554839936397313 | 1389554839936397313 | @sambhavnaseth @Priyajayant1 Hi sambhavana , n... | [29, 120] | <a href="http://twitter.com/download/android" ... | False | 1.389174e+18 | 1389174303833223172 | 170977970.0 | 170977970 | sambhavnaseth | {'id': 748314314499624960, 'id_str': '74831431... | None | None | {'id': '64231a922dea526e', 'url': 'https://api... | None | NaN | NaN | NaN | NaN | False | NaN | 0 | 0 | 0 | 0 | {'hashtags': [], 'urls': [], 'user_mentions': ... | False | False | NaN | low | en | [{'tag': None}] | NaN |
4 | 2021-05-04 12:16:08+00:00 | 1389554432463937539 | 1389554432463937539 | @narendramodi Sir please help no icu beds avai... | [14, 138] | <a href="http://twitter.com/download/android" ... | False | 1.389240e+18 | 1389239538313089024 | 18839785.0 | 18839785 | narendramodi | {'id': 748314314499624960, 'id_str': '74831431... | None | None | {'id': '64231a922dea526e', 'url': 'https://api... | None | NaN | NaN | NaN | NaN | False | NaN | 0 | 1 | 0 | 0 | {'hashtags': [], 'urls': [], 'user_mentions': ... | False | False | NaN | low | en | [{'tag': None}] | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
495 | 2021-05-03 16:52:22+00:00 | 1389261563773739021 | 1389261563773739021 | India need #OxygenEmergency \nअभी हम सबको एक द... | NaN | <a href="http://twitter.com/download/android" ... | True | NaN | None | NaN | None | None | {'id': 1389234457207615498, 'id_str': '1389234... | None | None | {'id': '317fcc4b21a604d5', 'url': 'https://api... | None | NaN | NaN | NaN | NaN | False | {'full_text': 'India need #OxygenEmergency अभ... | 0 | 1 | 1 | 1 | {'hashtags': [{'text': 'OxygenEmergency', 'ind... | False | False | NaN | low | hi | [{'tag': None}] | NaN |
496 | 2021-05-03 16:51:18+00:00 | 1389261293979258881 | 1389261293979258881 | Thanks for your support 🙏 AMEZ works by Gugugr... | [0, 140] | <a href="http://twitter.com/download/android" ... | True | NaN | None | NaN | None | None | {'id': 143736875, 'id_str': '143736875', 'name... | None | None | {'id': '317fcc4b21a604d5', 'url': 'https://api... | None | 1.389124e+18 | 1389124358182903809 | {'created_at': 'Mon May 03 07:47:10 +0000 2021... | {'url': 'https://t.co/FWv3Rue9yD', 'expanded':... | True | {'full_text': 'Thanks for your support 🙏 AMEZ ... | 0 | 0 | 0 | 0 | {'hashtags': [], 'urls': [{'url': 'https://t.c... | False | False | False | low | en | [{'tag': None}] | NaN |
497 | 2021-05-03 16:49:08+00:00 | 1389260748539400193 | 1389260748539400193 | Delhi CM Shri Kejriwal Ji should immediately g... | [0, 140] | <a href="http://twitter.com/download/android" ... | True | NaN | None | NaN | None | None | {'id': 944587346732752896, 'id_str': '94458734... | None | None | {'id': '317fcc4b21a604d5', 'url': 'https://api... | None | 1.389212e+18 | 1389212209436188673 | {'created_at': 'Mon May 03 13:36:15 +0000 2021... | {'url': 'https://t.co/Aa8Y0k4lRN', 'expanded':... | True | {'full_text': 'Delhi CM Shri Kejriwal Ji shoul... | 0 | 0 | 0 | 0 | {'hashtags': [], 'urls': [{'url': 'https://t.c... | False | False | False | low | en | [{'tag': None}] | NaN |
498 | 2021-05-03 16:47:22+00:00 | 1389260304689729542 | 1389260304689729542 | @rupashreenanda @SanjayAzadSln @AnitaSingh_ @_... | [56, 140] | <a href="http://twitter.com/download/android" ... | True | 1.388905e+18 | 1388905199830396932 | 146751977.0 | 146751977 | rupashreenanda | {'id': 72006548, 'id_str': '72006548', 'name':... | None | None | {'id': '317fcc4b21a604d5', 'url': 'https://api... | None | NaN | NaN | NaN | NaN | False | {'full_text': '@rupashreenanda @SanjayAzadSln ... | 0 | 0 | 0 | 1 | {'hashtags': [], 'urls': [{'url': 'https://t.c... | False | False | NaN | low | en | [{'tag': None}] | NaN |
499 | 2021-05-03 16:45:55+00:00 | 1389259939168743428 | 1389259939168743428 | @Voracious_af Hi brother, u help me alot. Than... | NaN | <a href="http://twitter.com/download/android" ... | False | NaN | None | 117085327.0 | 117085327 | Voracious_af | {'id': 1034416281259335685, 'id_str': '1034416... | None | None | {'id': '317fcc4b21a604d5', 'url': 'https://api... | None | NaN | NaN | NaN | NaN | False | NaN | 0 | 1 | 0 | 2 | {'hashtags': [], 'urls': [], 'user_mentions': ... | False | False | NaN | low | en | [{'tag': None}] | NaN |
500 rows × 35 columns
# Flag each TWINT tweet whose id also appears in the Premium Search API results.
# Series.isin performs the membership test in one pass instead of rebuilding a
# Python list of all API ids for every TWINT row.
twint_tweets['included'] = twint_tweets['id'].isin(api_tweets['id'])

# Show the tweets returned by TWINT that are absent from the Premium Search API results.
twint_tweets[~twint_tweets['included']]
id | conversation_id | created_at | date | time | timezone | user_id | username | name | place | tweet | language | mentions | urls | photos | replies_count | retweets_count | likes_count | hashtags | cashtags | link | retweet | quote_url | video | thumbnail | near | geo | source | user_rt_id | user_rt | retweet_id | reply_to | retweet_date | translate | trans_src | trans_dest | included | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1389567616314855424 | 1389567616314855424 | 2021-05-04 15:08:31+02:00 | 2021-05-04 | 15:08:31 | 200 | 741987952738377728 | anshmishra94 | ANSH MISHRA | NaN | Dear Sir, Kindly help us in this pandemic days... | en | [{'screen_name': 'dmsouthwest1', 'name': 'dm/d... | [] | ['https://pbs.twimg.com/media/E0i8YcjUcAQQVCW.... | 0 | 0 | 0 | [] | [] | https://twitter.com/anshmishra94/status/138956... | False | NaN | 1 | https://pbs.twimg.com/media/E0i8YcjUcAQQVCW.jpg | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | False |
1 | 1389567586208075788 | 1389566912200253441 | 2021-05-04 15:08:24+02:00 | 2021-05-04 | 15:08:24 | 200 | 2302564298 | jeetuoutlook | Jeetu Sharma | NaN | @RubikaLiyaquat 🆘 urgent Require Plasma Bloo... | en | [] | [] | [] | 0 | 0 | 1 | [] | [] | https://twitter.com/JeetuOutlook/status/138956... | False | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [{'screen_name': 'RubikaLiyaquat', 'name': 'Ru... | NaN | NaN | NaN | NaN | False |
2 | 1389567573637795846 | 1385321514279477249 | 2021-05-04 15:08:21+02:00 | 2021-05-04 | 15:08:21 | 200 | 1389541510165635072 | sharmaricha26 | Richa Sharma | NaN | @hiteshsharma_7 Hi I need urgently ventilator ... | en | [] | [] | [] | 0 | 0 | 0 | [] | [] | https://twitter.com/sharmaRicha26/status/13895... | False | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [{'screen_name': 'hiteshsharma_7', 'name': 'Gw... | NaN | NaN | NaN | NaN | False |
3 | 1389567556873166852 | 1389566997038436355 | 2021-05-04 15:08:17+02:00 | 2021-05-04 | 15:08:17 | 200 | 3193112497 | bsesdelhi | BSES Delhi | NaN | @Deen_Aligarian It would be great if you could... | en | [] | [] | [] | 0 | 0 | 0 | [] | [] | https://twitter.com/bsesdelhi/status/138956755... | False | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [{'screen_name': 'Deen_Aligarian', 'name': 'Al... | NaN | NaN | NaN | NaN | False |
4 | 1389567527328509958 | 1389459917078106112 | 2021-05-04 15:08:10+02:00 | 2021-05-04 | 15:08:10 | 200 | 1667354024 | help_delhivery | helpdesk_delhivery | NaN | @Aarav63950265 Kindly allow us some time as th... | en | [] | [] | [] | 0 | 0 | 0 | [] | [] | https://twitter.com/help_delhivery/status/1389... | False | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [{'screen_name': 'Aarav63950265', 'name': 'Aar... | NaN | NaN | NaN | NaN | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
17537 | 1389007185074548736 | 1388574853196705794 | 2021-05-03 02:01:34+02:00 | 2021-05-03 | 02:01:34 | 200 | 3959099054 | neerajdegreat | Neeraj Awasthi | NaN | @SonuSood @SoodFoundation Hi @SonuSood one of ... | en | [{'screen_name': 'sonusood', 'name': 'sonu soo... | [] | [] | 0 | 0 | 0 | [] | [] | https://twitter.com/neerajdegreat/status/13890... | False | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [{'screen_name': 'SonuSood', 'name': 'sonu soo... | NaN | NaN | NaN | NaN | False |
17538 | 1389007034570346500 | 1388911082903719938 | 2021-05-03 02:00:58+02:00 | 2021-05-03 | 02:00:58 | 200 | 89141954 | ricky_insomniac | Behosh Item | NaN | @RichardDawkins @RichardDawkins This is side e... | en | [] | [] | [] | 0 | 0 | 1 | [] | [] | https://twitter.com/ricky_insomniac/status/138... | False | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [{'screen_name': 'RichardDawkins', 'name': 'Ri... | NaN | NaN | NaN | NaN | False |
17539 | 1389007031567192065 | 1389007031567192065 | 2021-05-03 02:00:57+02:00 | 2021-05-03 | 02:00:57 | 200 | 96164693 | frolicky | KantSurya | NaN | #SahareKaHath #SOS #Plasma #PlasmaDonor #COVID... | und | [] | [] | [] | 0 | 0 | 0 | ['saharekahath', 'sos', 'plasma', 'plasmadonor... | [] | https://twitter.com/frolicky/status/1389007031... | False | https://twitter.com/TheOutspokenBoy/status/138... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | False |
17540 | 1389006971794198529 | 1389006971794198529 | 2021-05-03 02:00:43+02:00 | 2021-05-03 | 02:00:43 | 200 | 837199895022956545 | prajapatiji011 | Kashmir Singh Prajapati | NaN | Even the embassies are seeking help from the O... | en | [] | [] | ['https://pbs.twimg.com/media/E0a-dygVkAIW9g1.... | 0 | 0 | 0 | [] | [] | https://twitter.com/Prajapatiji011/status/1389... | False | NaN | 1 | https://pbs.twimg.com/media/E0a-dygVkAIW9g1.jpg | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | False |
17541 | 1389006918572597252 | 1389006918572597252 | 2021-05-03 02:00:30+02:00 | 2021-05-03 | 02:00:30 | 200 | 96164693 | frolicky | KantSurya | NaN | #SahareKaHath #SOS #Plasma #PlasmaDonor #COVID... | und | [] | [] | [] | 0 | 0 | 0 | ['saharekahath', 'sos', 'plasma', 'plasmadonor... | [] | https://twitter.com/frolicky/status/1389006918... | False | https://twitter.com/honeysinghfb/status/138808... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | False |
17050 rows × 37 columns
Tweets that were found by TWINT and by the Premium Search include a Twitter "place," as in this example: https://twitter.com/SumatiThusoo/status/1389522172671037442
Tweets that were found by TWINT but not by the Premium Search API do not. It appears that information on location is being sourced from the user's profile location in this case.
Unfortunately, very few TWINT tweets contain populated "place" information, even tweets which include a place in the Twitter web interface. It is unknown why this is the case.
# Keep only the TWINT tweets whose 'place' value is populated. Missing values
# are NaN, so notna() expresses the check directly rather than comparing each
# value's type against float.
twint_tweets[twint_tweets['place'].notna()]
id | conversation_id | created_at | date | time | timezone | user_id | username | name | place | tweet | language | mentions | urls | photos | replies_count | retweets_count | likes_count | hashtags | cashtags | link | retweet | quote_url | video | thumbnail | near | geo | source | user_rt_id | user_rt | retweet_id | reply_to | retweet_date | translate | trans_src | trans_dest | included | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
36 | 1389566757132595204 | 1389566757132595204 | 2021-05-04 15:05:06+02:00 | 2021-05-04 | 15:05:06 | 200 | 149585330 | pairsonnalitesa | Stigmabase | AS | {'type': 'Point', 'coordinates': [28.5974128, ... | INNR — My parents do not know where Harvard is... | en | [] | ['http://dlvr.it/Rz1np3'] | [] | 0 | 0 | 0 | [] | [] | https://twitter.com/PairsonnalitesA/status/138... | False | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | False |
2725 | 1389498775664791554 | 1389498775664791554 | 2021-05-04 10:34:58+02:00 | 2021-05-04 | 10:34:58 | 200 | 1085520064017772545 | sagarmotors | Sagar Motors | {'type': 'Point', 'coordinates': [28.6410658, ... | #help #getvaccienated #registration #staysafe ... | no | [{'screen_name': 'surajkalra', 'name': 'suraj ... | ['https://www.instagram.com/p/COccvurAt97/?igs... | [] | 0 | 0 | 0 | ['help', 'getvaccienated', 'registration', 'st... | [] | https://twitter.com/SagarMotors/status/1389498... | False | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | True |
7093 | 1389287959271198729 | 1389287959271198729 | 2021-05-03 20:37:15+02:00 | 2021-05-03 | 20:37:15 | 200 | 936113527 | j69293418 | Brown boy | {'type': 'Point', 'coordinates': [28.631747, 7... | Need urgent assistance @ Delhi, India https:/... | en | [] | ['https://www.instagram.com/p/COa82wzFTHW6UqpI... | [] | 0 | 0 | 0 | [] | [] | https://twitter.com/j69293418/status/138928795... | False | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | True |
Unfortunately, the TWINT "Location" option is currently non-functional. This means that it is not possible to distinguish the tweets that only match based on the user's profile location.
# Keep only the TWINT tweets whose 'geo' value is populated. Missing values
# are NaN, so notna() expresses the check directly rather than comparing each
# value's type against float.
twint_tweets[twint_tweets['geo'].notna()]
id | conversation_id | created_at | date | time | timezone | user_id | username | name | place | tweet | language | mentions | urls | photos | replies_count | retweets_count | likes_count | hashtags | cashtags | link | retweet | quote_url | video | thumbnail | near | geo | source | user_rt_id | user_rt | retweet_id | reply_to | retweet_date | translate | trans_src | trans_dest | included |
---|
From these observations we can conclude that the difference in tweet volume is caused by the Twitter Advanced Search (and therefore TWINT as well) including tweets based on the user's profile location information, but only for the most recent seven days. This means that tweet volume measured using TWINT from the most recent seven days cannot be compared with volume beyond that time range.
# Page backwards through the standard Search API, collecting every status
# into `tweets`.
tweets = []
# The max_id parameter lets us request additional pages of tweets. It stays
# None until the first page has been received.
maxid = None
# make up to 2000 requests
for i in range(2000):
    print(i)
    params = {"q": twintq, "result_type": "recent", "count": 100, "tweet_mode": "extended"}
    if maxid is not None:
        params["max_id"] = maxid
    resp = api.request("search/tweets", params)
    response = resp.json()
    # we might get rate limited, which resets after 15 minutes
    if 'errors' in response and response['errors'][0]['code'] == 88:
        print("rate limit error, waiting 15 minutes")
        time.sleep(15*60)
    elif 'statuses' in response:
        statuses = response['statuses']
        # an empty page means there is nothing older left to fetch
        if not statuses:
            break
        # print the time of the last status in this set, for monitoring purposes
        print(statuses[-1]['created_at'])
        # add statuses to tweet list
        tweets += statuses
        # max_id is inclusive, so ask for ids strictly below the last one we
        # received; otherwise that status is returned again at the top of the
        # next page and ends up duplicated in `tweets`
        maxid = statuses[-1]['id'] - 1
    else:
        print("other error")
        print(response)
        time.sleep(60)
# One row per status collected from the standard Search API.
search_api_tweets = pd.DataFrame(tweets)

# exclude retweets: rows with no 'retweeted_status' value are kept.
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of search_api_tweets (which triggers SettingWithCopyWarning).
if 'retweeted_status' in search_api_tweets.columns:
    search_api_norts = search_api_tweets[search_api_tweets['retweeted_status'].isna()].copy()
else:
    # no status carried a 'retweeted_status' field, so there is nothing to exclude
    search_api_norts = search_api_tweets.copy()

# parse each 'created_at' value into a timestamp
search_api_norts['created_at'] = search_api_norts['created_at'].apply(pd.to_datetime)
def hourly_tweet_counts(frame):
    """Return the per-hour count of 'id' values in *frame*, binned on 'created_at'."""
    return frame.groupby(pd.Grouper(key='created_at', freq='1H')).count()['id']


# The TWINT series is reused below to set the x-axis limits.
twint_hourly = hourly_tweet_counts(twint_tweets)

plt.figure(figsize=(10,6))
plt.plot(twint_hourly)
plt.plot(hourly_tweet_counts(search_api_norts))
plt.plot(hourly_tweet_counts(api_tweets))
plt.plot(dfa['timePeriod'], dfa['count'])
plt.legend(['TWINT (Web Advanced Search)', 'Standard API Search', 'Premium API Search', 'Premium API Counts'])
# limit the x-axis to the span from the first to the second-to-last TWINT hourly bin
plt.xlim(twint_hourly.index[0], twint_hourly.index[-2])
(18750.0, 18751.5)
The standard Twitter Search API appears to match the behavior of TWINT, however it returns slightly fewer tweets. It is not immediately clear why certain tweets are not included.
# Restrict the TWINT tweets to those created strictly between the two
# timestamps. The bounds are parsed once here rather than once per row.
window_start = pd.to_datetime('2021-05-03 12:00:00Z')
window_end = pd.to_datetime('2021-05-03 20:00:00Z')
in_window = twint_tweets['created_at'].apply(lambda v: window_start < v < window_end)
twint_subset = twint_tweets[in_window]

# Tweets in the window that are absent from the standard Search API results.
# Series.isin avoids rebuilding a Python list of ids for every row.
twint_subset[~twint_subset['id'].isin(search_api_norts['id'])]
id | conversation_id | created_at | date | time | timezone | user_id | username | name | place | tweet | language | mentions | urls | photos | replies_count | retweets_count | likes_count | hashtags | cashtags | link | retweet | quote_url | video | thumbnail | near | geo | source | user_rt_id | user_rt | retweet_id | reply_to | retweet_date | translate | trans_src | trans_dest | included | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
6547 | 1389308659356737537 | 1389308659356737537 | 2021-05-03 21:59:31+02:00 | 2021-05-03 | 21:59:31 | 200 | 2433805362 | danceofminds | Sourav Mitra | NaN | #PSF 4may 1.23am 1. For not caring4elders &am... | en | [] | [] | [] | 0 | 0 | 0 | ['psf'] | [] | https://twitter.com/danceofminds/status/138930... | False | https://twitter.com/danceofminds/status/138930... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | False |
6548 | 1389308642416107525 | 1389308642416107525 | 2021-05-03 21:59:27+02:00 | 2021-05-03 | 21:59:27 | 200 | 388956460 | sendmanishkumar | Manish kumar | NaN | Urgent need for 20 million vaccines to cover s... | en | [] | ['https://answersadda.com/pressing-want-for-20... | [] | 0 | 0 | 0 | [] | [] | https://twitter.com/sendmanishkumar/status/138... | False | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | False |
6559 | 1389308063375527942 | 1389308063375527942 | 2021-05-03 21:57:09+02:00 | 2021-05-03 | 21:57:09 | 200 | 889705398076186624 | devende17081984 | Devender Rawat | NaN | @fortis_hospital @PMOIndia @mlkhattar @MoHFW_I... | en | [{'screen_name': 'pmoindia', 'name': 'pmo indi... | [] | [] | 0 | 0 | 0 | [] | [] | https://twitter.com/Devende17081984/status/138... | False | https://twitter.com/Devende17081984/status/138... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | True |
6567 | 1389307634151411712 | 1389307634151411712 | 2021-05-03 21:55:26+02:00 | 2021-05-03 | 21:55:26 | 200 | 4632724398 | puneet23011984 | Puneet Bhatia | NaN | @DC_Faridabad @yashpalmurar Dear Yashpal Ji, ... | en | [{'screen_name': 'yashpalmurar', 'name': 'yada... | [] | [] | 1 | 0 | 1 | [] | [] | https://twitter.com/Puneet23011984/status/1389... | False | https://twitter.com/enviara/status/13888206223... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | False |
6580 | 1389307069866536960 | 1389307069866536960 | 2021-05-03 21:53:12+02:00 | 2021-05-03 | 21:53:12 | 200 | 122646612 | mslittlelawyer | Mansi Batra | NaN | Need non covid icu bed in Delhi URGENT @SonuSo... | en | [{'screen_name': 'sonusood', 'name': 'sonu soo... | [] | [] | 1 | 0 | 0 | [] | [] | https://twitter.com/MsLittleLawyer/status/1389... | False | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
11102 | 1389190114471911431 | 1389190114471911431 | 2021-05-03 14:08:27+02:00 | 2021-05-03 | 14:08:27 | 200 | 1256587499956244480 | pramilak25 | Pramila | NaN | @RubikaLiyaquat everyone is talking for covid ... | en | [] | ['http://Education.Today'] | [] | 0 | 0 | 0 | [] | [] | https://twitter.com/pramilak25/status/13891901... | False | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | False |
11106 | 1389190033949839366 | 1389190033949839366 | 2021-05-03 14:08:08+02:00 | 2021-05-03 | 14:08:08 | 200 | 284859592 | 9050898887 | Amit joon | NaN | @anilvijminister sir namaskar!! I am from vpo ... | en | [] | [] | [] | 0 | 0 | 0 | [] | [] | https://twitter.com/9050898887/status/13891900... | False | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | False |
11122 | 1389189609892900865 | 1389189609892900865 | 2021-05-03 14:06:27+02:00 | 2021-05-03 | 14:06:27 | 200 | 368303392 | ravish2808 | Ravish Sinha | NaN | @taapsee @drharshvardhan @CovidIndiaSeva need... | en | [{'screen_name': 'drharshvardhan', 'name': 'dr... | [] | [] | 0 | 0 | 0 | [] | [] | https://twitter.com/Ravish2808/status/13891896... | False | https://twitter.com/Ravish2808/status/13891885... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | False |
11138 | 1389189191691472897 | 1389189191691472897 | 2021-05-03 14:04:47+02:00 | 2021-05-03 | 14:04:47 | 200 | 135044487 | arundeepak1987 | Arun Kumar Sharma | NaN | @ArvindKejriwal @msisodia Sir I have tried fi... | en | [{'screen_name': 'msisodia', 'name': 'manish s... | [] | ['https://pbs.twimg.com/media/E0dkNCfVIAAnS9Q.... | 0 | 0 | 0 | [] | [] | https://twitter.com/arundeepak1987/status/1389... | False | NaN | 1 | https://pbs.twimg.com/media/E0dkNCfVIAAnS9Q.jpg | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | False |
11143 | 1389189001538506755 | 1389189001538506755 | 2021-05-03 14:04:02+02:00 | 2021-05-03 | 14:04:02 | 200 | 1379403403042807809 | amniraghav | Amrita Singh | NaN | @SonuSood sir plz help me for a bed with oxgen... | en | [{'screen_name': 'dmgbnagar', 'name': 'dm g.b.... | [] | [] | 0 | 0 | 0 | [] | [] | https://twitter.com/Amniraghav/status/13891890... | False | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | False |
356 rows × 37 columns
# Tweets in the subset that also appear in the standard Search API results.
# Series.isin avoids rebuilding a Python list of ids for every row.
twint_subset[twint_subset['id'].isin(search_api_norts['id'])]
id | conversation_id | created_at | date | time | timezone | user_id | username | name | place | tweet | language | mentions | urls | photos | replies_count | retweets_count | likes_count | hashtags | cashtags | link | retweet | quote_url | video | thumbnail | near | geo | source | user_rt_id | user_rt | retweet_id | reply_to | retweet_date | translate | trans_src | trans_dest | included | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
6549 | 1389308559804882944 | 1385207890470850565 | 2021-05-03 21:59:07+02:00 | 2021-05-03 | 21:59:07 | 200 | 331642738 | syedanwarkaifee | syedanwarkaifee | NaN | @DCPNWestDelhi @DelhiPolice @CPDelhi @SanjaySi... | en | [{'screen_name': 'qazibisma_ips', 'name': 'qaz... | [] | [] | 0 | 0 | 0 | [] | [] | https://twitter.com/SyedanwarKaifee/status/138... | False | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [{'screen_name': 'DCPNWestDelhi', 'name': 'DCP... | NaN | NaN | NaN | NaN | False |
6550 | 1389308347535396864 | 1389308347535396864 | 2021-05-03 21:58:16+02:00 | 2021-05-03 | 21:58:16 | 200 | 1062742964 | rubikaliyaquat | Rubika Liyaquat | NaN | अभी अभी एक मैसेज मिला.. आख़िरी लाईन ने परेशान ... | hi | [{'screen_name': 'mryaduvansh', 'name': 'subha... | [] | ['https://pbs.twimg.com/media/E0fQk6vVkAMPw-7.... | 73 | 524 | 1765 | [] | [] | https://twitter.com/RubikaLiyaquat/status/1389... | False | NaN | 1 | https://pbs.twimg.com/media/E0fQk6vVkAMPw-7.jpg | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | False |
6551 | 1389308300181655552 | 1389299512708009987 | 2021-05-03 21:58:05+02:00 | 2021-05-03 | 21:58:05 | 200 | 1100355962 | pihushikha | Shikha Singh | NaN | @SonuSood Patient name: Swapnil jaiswal Age : ... | en | [] | [] | [] | 3 | 0 | 3 | [] | [] | https://twitter.com/PihuShikha/status/13893083... | False | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [{'screen_name': 'SonuSood', 'name': 'sonu soo... | NaN | NaN | NaN | NaN | False |
6552 | 1389308280363651074 | 1389308280363651074 | 2021-05-03 21:58:00+02:00 | 2021-05-03 | 21:58:00 | 200 | 3056317147 | shoaibch1996 | Shoaib Chaudhary | NaN | #Need_help_guys Please help, A physically cha... | en | [{'screen_name': 'sushant_says', 'name': 'सुशा... | [] | [] | 0 | 1 | 2 | ['need_help_guys'] | [] | https://twitter.com/ShoaibCh1996/status/138930... | False | https://twitter.com/khushi9595/status/13892744... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | False |
6553 | 1389308256288395266 | 1389308256288395266 | 2021-05-03 21:57:55+02:00 | 2021-05-03 | 21:57:55 | 200 | 118164196 | bipinmisra | BIPIN | NaN | @PMOIndia this can be critical help | en | [] | [] | [] | 0 | 0 | 0 | [] | [] | https://twitter.com/bipinmisra/status/13893082... | False | https://twitter.com/manugrover73/status/138925... | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
11173 | 1389188250397396995 | 1389188250397396995 | 2021-05-03 14:01:03+02:00 | 2021-05-03 | 14:01:03 | 200 | 989199838716841991 | naveent05233625 | Naveen Tanwar | NaN | @dr_plasmabank Darshan kapoor PSRI HOSPITAL s... | en | [{'screen_name': 'srinivasiyc', 'name': 'srini... | [] | [] | 1 | 1 | 1 | ['plasmadonor', 'plasmadonors', 'plasmadonors'] | [] | https://twitter.com/NaveenT05233625/status/138... | False | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | False |
11174 | 1389188114191634434 | 1389181135477870593 | 2021-05-03 14:00:30+02:00 | 2021-05-03 | 14:00:30 | 200 | 180740221 | policybazaar | Policybazaar | NaN | @manish_nachnani Manish, we have asked our tea... | en | [] | [] | [] | 0 | 0 | 0 | [] | [] | https://twitter.com/policybazaar/status/138918... | False | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [{'screen_name': 'manish_nachnani', 'name': 'M... | NaN | NaN | NaN | NaN | False |
11175 | 1389188036924116992 | 1389174672978153476 | 2021-05-03 14:00:12+02:00 | 2021-05-03 | 14:00:12 | 200 | 989199838716841991 | naveent05233625 | Naveen Tanwar | NaN | @sucherita_k @CharuPragya @rohit_chahal @Haris... | en | [{'screen_name': 'srinivasiyc', 'name': 'srini... | [] | [] | 1 | 1 | 1 | ['plasmadonor', 'plasmadonors', 'plasmadonors'] | [] | https://twitter.com/NaveenT05233625/status/138... | False | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [{'screen_name': 'sucherita_k', 'name': 'Suche... | NaN | NaN | NaN | NaN | False |
11176 | 1389188032109027329 | 1389188032109027329 | 2021-05-03 14:00:11+02:00 | 2021-05-03 | 14:00:11 | 200 | 1256992875759296512 | garimayadav111 | Garima Yadav | NaN | #Urgent Blood group: O+ Plasma required (mus... | en | [{'screen_name': 'arjunbhatigolf', 'name': 'ar... | [] | [] | 0 | 2 | 1 | ['urgent'] | [] | https://twitter.com/Garimayadav111/status/1389... | False | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [] | NaN | NaN | NaN | NaN | False |
11177 | 1389188002518163460 | 1386507720874815490 | 2021-05-03 14:00:04+02:00 | 2021-05-03 | 14:00:04 | 200 | 149462081 | sumitnath75 | Sumit Nath | NaN | @mohammadasimnod @pushpendrakum @smartviewai @... | en | [] | [] | [] | 0 | 0 | 0 | [] | [] | https://twitter.com/sumitnath75/status/1389188... | False | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [{'screen_name': 'mohammadasimnod', 'name': 'm... | NaN | NaN | NaN | NaN | False |
4275 rows × 37 columns
# Show the Search API tweets whose 'place' value is not None.
search_api_norts.loc[
    search_api_norts['place'].map(lambda place: place is not None)
]
created_at | id | id_str | full_text | truncated | display_text_range | entities | metadata | source | in_reply_to_status_id | in_reply_to_status_id_str | in_reply_to_user_id | in_reply_to_user_id_str | in_reply_to_screen_name | user | geo | coordinates | place | contributors | retweeted_status | is_quote_status | retweet_count | favorite_count | favorited | retweeted | lang | quoted_status_id | quoted_status_id_str | quoted_status | possibly_sensitive | extended_entities | withheld_in_countries | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
6 | 2021-05-05 12:49:08+00:00 | 1389925127937150979 | 1389925127937150979 | Dear friends. Very urgent. Please help https:/... | False | [0, 38] | {'hashtags': [], 'symbols': [], 'user_mentions... | {'iso_language_code': 'en', 'result_type': 're... | <a href="http://twitter.com/download/android" ... | NaN | None | NaN | None | None | {'id': 1368507231759663105, 'id_str': '1368507... | None | None | {'id': '2e6064382c71b343', 'url': 'https://api... | None | NaN | True | 0 | 0 | False | False | en | 1.389917e+18 | 1389917435600408581 | {'created_at': 'Wed May 05 12:18:34 +0000 2021... | False | NaN | NaN |
20 | 2021-05-05 12:48:49+00:00 | 1389925047033204741 | 1389925047033204741 | 24/7 Technical Support... Work from Home 🏠🏠\nS... | False | [0, 210] | {'hashtags': [{'text': 'training', 'indices': ... | {'iso_language_code': 'en', 'result_type': 're... | <a href="http://twitter.com/download/android" ... | NaN | None | NaN | None | None | {'id': 4706758688, 'id_str': '4706758688', 'na... | None | None | {'id': '64231a922dea526e', 'url': 'https://api... | None | NaN | False | 0 | 0 | False | False | en | NaN | NaN | NaN | False | NaN | NaN |
133 | 2021-05-05 12:45:59+00:00 | 1389924332495724548 | 1389924332495724548 | @artistprabhkar_ @srinivasiyc @LambaAlka @DrKu... | False | [166, 248] | {'hashtags': [], 'symbols': [], 'user_mentions... | {'iso_language_code': 'en', 'result_type': 're... | <a href="http://twitter.com/download/android" ... | 1.389869e+18 | 1389869104912240640 | 1.385550e+18 | 1385549855230205955 | artistprabhkar_ | {'id': 1028169777192357888, 'id_str': '1028169... | None | None | {'id': '317fcc4b21a604d5', 'url': 'https://api... | None | NaN | False | 0 | 0 | False | False | en | NaN | NaN | NaN | NaN | NaN | NaN |
211 | 2021-05-05 12:44:09+00:00 | 1389923874213556230 | 1389923874213556230 | @drrajeev4uAIIMS @narendramodi @PMOIndia @AIIM... | False | [167, 449] | {'hashtags': [], 'symbols': [], 'user_mentions... | {'iso_language_code': 'en', 'result_type': 're... | <a href="http://twitter.com/download/android" ... | 1.389923e+18 | 1389923194685988868 | 1.681366e+07 | 16813656 | drrajeev4uAIIMS | {'id': 306929475, 'id_str': '306929475', 'name... | None | None | {'id': '317fcc4b21a604d5', 'url': 'https://api... | None | NaN | False | 3 | 1 | False | False | en | NaN | NaN | NaN | NaN | NaN | NaN |
298 | 2021-05-05 12:41:46+00:00 | 1389923273798873091 | 1389923273798873091 | #CoronaVaccine government should make a commit... | False | [0, 206] | {'hashtags': [{'text': 'CoronaVaccine', 'indic... | {'iso_language_code': 'en', 'result_type': 're... | <a href="http://twitter.com/download/android" ... | NaN | None | NaN | None | None | {'id': 2325281942, 'id_str': '2325281942', 'na... | None | None | {'id': '2281169281a9d3a2', 'url': 'https://api... | None | NaN | False | 0 | 0 | False | False | en | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
138001 | 2021-05-03 06:32:20+00:00 | 1389105528022835202 | 1389105528022835202 | An urgent requirement of oxygen in for a patie... | False | [0, 187] | {'hashtags': [], 'symbols': [], 'user_mentions... | {'iso_language_code': 'en', 'result_type': 're... | <a href="http://twitter.com/download/iphone" r... | NaN | None | NaN | None | None | {'id': 3306621036, 'id_str': '3306621036', 'na... | None | None | {'id': '317fcc4b21a604d5', 'url': 'https://api... | None | NaN | False | 7 | 4 | False | False | en | NaN | NaN | NaN | NaN | NaN | NaN |
138328 | 2021-05-03 06:28:54+00:00 | 1389104662486274049 | 1389104662486274049 | Please help #Verified \n@AdityaRajKaul @Suparn... | False | [0, 131] | {'hashtags': [{'text': 'Verified', 'indices': ... | {'iso_language_code': 'en', 'result_type': 're... | <a href="http://twitter.com/download/android" ... | NaN | None | NaN | None | None | {'id': 120702930, 'id_str': '120702930', 'name... | None | None | {'id': '317fcc4b21a604d5', 'url': 'https://api... | None | NaN | True | 2 | 1 | False | False | en | 1.389097e+18 | 1389096527835013121 | {'created_at': 'Mon May 03 05:56:35 +0000 2021... | False | NaN | NaN |
138341 | 2021-05-03 06:28:43+00:00 | 1389104616642473986 | 1389104616642473986 | Plz. Help https://t.co/Y82ki3MGGm | False | [0, 9] | {'hashtags': [], 'symbols': [], 'user_mentions... | {'iso_language_code': 'en', 'result_type': 're... | <a href="http://twitter.com/download/android" ... | NaN | None | NaN | None | None | {'id': 70360790, 'id_str': '70360790', 'name':... | None | None | {'id': '472313ed153fe2bf', 'url': 'https://api... | None | NaN | True | 1 | 2 | False | False | en | 1.389083e+18 | 1389082615596998659 | {'created_at': 'Mon May 03 05:01:18 +0000 2021... | False | NaN | NaN |
138357 | 2021-05-03 06:28:29+00:00 | 1389104557259583492 | 1389104557259583492 | Plz help @rohit_chahal ji https://t.co/PaWvWpzlSZ | False | [0, 25] | {'hashtags': [], 'symbols': [], 'user_mentions... | {'iso_language_code': 'en', 'result_type': 're... | <a href="http://twitter.com/download/android" ... | NaN | None | NaN | None | None | {'id': 1540540308, 'id_str': '1540540308', 'na... | None | None | {'id': '317fcc4b21a604d5', 'url': 'https://api... | None | NaN | True | 9 | 18 | False | False | en | 1.389104e+18 | 1389103970094698496 | {'created_at': 'Mon May 03 06:26:09 +0000 2021... | False | NaN | NaN |
138462 | 2021-05-03 06:27:11+00:00 | 1389104231685128195 | 1389104231685128195 | Please RT/Help\n\n@AdityaRajKaul @OmarAbdullah... | False | [0, 98] | {'hashtags': [{'text': 'SOSDelhi', 'indices': ... | {'iso_language_code': 'en', 'result_type': 're... | <a href="http://twitter.com/download/android" ... | NaN | None | NaN | None | None | {'id': 120702930, 'id_str': '120702930', 'name... | None | None | {'id': '317fcc4b21a604d5', 'url': 'https://api... | None | NaN | True | 0 | 1 | False | False | en | 1.389102e+18 | 1389102188245385220 | {'created_at': 'Mon May 03 06:19:04 +0000 2021... | False | NaN | NaN |
1536 rows × 32 columns
Since the data returned by the standard Twitter Search API contains populated place information, it would be possible to merge it with the TWINT data to form a consistent dataset of tweets with place data within a specified radius, using only free APIs. However, the standard Twitter Search API is subject to rate limits, so the merged dataset would cover only a limited selection of tweets.
Another limitation of both the Premium Search API and the standard Twitter Search API is that the search radius is capped at 40 km. There is no (observed) maximum radius when searching with Twitter Advanced Search.
# Get counts from the counts endpoint, including retweets.
# Stored under its own name (api_normal_rt) so that the result is not
# overwritten by the retweet-free request below.
resp_normal = api.request("tweets/search/30day/:dev/counts", {"query": "urgent oxygen"})
api_normal_rt = pd.DataFrame(resp_normal.json()['results'])
api_normal_rt['timePeriod'] = api_normal_rt['timePeriod'].apply(pd.to_datetime)

# Get counts from the counts endpoint, excluding retweets.
# This stays in `api_normal`, the frame that is later plotted against the
# TWINT tweet volume.
resp_normal_nrt = api.request("tweets/search/30day/:dev/counts", {"query": "urgent oxygen -is:retweet"})
api_normal = pd.DataFrame(resp_normal_nrt.json()['results'])
api_normal['timePeriod'] = api_normal['timePeriod'].apply(pd.to_datetime)
# Run the "urgent oxygen" search through TWINT and store the results as CSV.
c = twint.Config()

# What to search for, and the earliest date to include.
c.Search = 'urgent oxygen'
c.Since = "2021-04-04"

# Where the results go: a CSV file, with console output hidden.
c.Output = "/Users/loganw/urgent_oxygen.csv"
c.Store_csv = True
c.Hide_output = True

twint.run.Search(c)
[!] No more data! Scraping will stop now. found 0 deleted tweets in this search.
# Load the tweets scraped by TWINT, parsing 'created_at' as datetimes.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids the DtypeWarning about mixed types
# in column 9 that the default setting produces for this file.
twint_normal = pd.read_csv(
    '/Users/loganw/urgent_oxygen.csv',
    parse_dates=['created_at'],
    low_memory=False,
)
/Users/loganw/.local/share/virtualenvs/venv-numba-UjqtKfx8/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3165: DtypeWarning: Columns (9) have mixed types.Specify dtype option on import or set low_memory=False. has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
# Bucket the TWINT tweets into one-hour bins on 'created_at'; count() yields
# the number of non-null entries per column in each bin.
twint_normal_grp = twint_normal.groupby(
    pd.Grouper(key='created_at', freq='1H')
).count()

# Overlay the hourly TWINT volume (the 'id' count) with the counts returned
# by the counts endpoint.
plt.figure(figsize=(10, 6))
plt.plot(twint_normal_grp['id'], label='Volume of tweets on TWINT')
plt.plot(
    api_normal['timePeriod'],
    api_normal['count'],
    label='Volume of tweets counted by Twitter Premium Search API',
)
plt.legend()
<matplotlib.legend.Legend at 0x1349fca00>
The two series appear to match quite closely, with the Premium Search API counting slightly more tweets in certain circumstances. This is consistent with the official Twitter documentation.
# save session
# Writes the current interpreter session to 'india-twitter-analysis.pkl'
# (a relative path) so the notebook state can be restored later without
# re-running the API requests and scrapes above.
import dill
dill.dump_session('india-twitter-analysis.pkl')
# restore session
# Reloads the session from 'india-twitter-analysis.pkl', the same file name
# that the save step passes to dill.dump_session.
# NOTE(review): dill session files are pickle-based, so loading one can run
# arbitrary code — only restore files you created yourself.
import dill
dill.load_session('india-twitter-analysis.pkl')