# I used the TwitterAPI Python package as a convenient wrapper for the API requests
from TwitterAPI import TwitterAPI
import pandas as pd
pd.set_option('display.max_columns', None)
import time
import json
import matplotlib.pyplot as plt
import numpy as np
import dateutil
import datetime
import matplotlib.dates as mdates
import seaborn as sns
# Set up the Twitter API client with OAuth keys. These should be set elsewhere and kept secret!
api = TwitterAPI(consumer_key, consumer_secret, access_token_key, access_token_secret)
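One minimal way to keep those keys out of the notebook is to read them from environment variables; the sketch below assumes they have been exported under the (illustrative, hypothetical) names shown in the comments.
import os
# Hypothetical environment variable names -- adjust to wherever the credentials are actually stored
consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]
access_token_key = os.environ["TWITTER_ACCESS_TOKEN"]
access_token_secret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]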
resp_hourly = api.request("tweets/search/30day/:dev/counts", {"query": "(urgent OR help) point_radius:[77.232410 28.637459 40km]"})
dfa = pd.DataFrame(resp_hourly.json()['results'])
dfa['timePeriod'] = pd.to_datetime(dfa['timePeriod'])
resp_daily = api.request("tweets/search/30day/:dev/counts", {"query": "(urgent OR help) point_radius:[77.232410 28.637459 40km]", "bucket": "day"})
dfa2 = pd.DataFrame(resp_daily.json()['results'])
dfa2['timePeriod'] = pd.to_datetime(dfa2['timePeriod'])
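Both of these requests fit in a single page of results here, but a counts request covering a longer window can be paginated. A hedged sketch of collecting all buckets, assuming the response body carries a "next" token when more pages remain:
def fetch_all_counts(api, query, bucket="hour"):
    # Collect every time bucket for a counts query, following "next" tokens if present
    params = {"query": query, "bucket": bucket}
    results = []
    while True:
        body = api.request("tweets/search/30day/:dev/counts", params).json()
        results.extend(body.get("results", []))
        if "next" in body:
            params["next"] = body["next"]
        else:
            break
    return pd.DataFrame(results)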
When grouped by day, the hourly data closely matches the daily data. However, especially on days with low tweet volume, the aggregated hourly data appears to be a slight over-estimate. This is explained by the quantization of low-value counts, seen below.
plt.plot(dfa2['timePeriod'], dfa2['count'])
plt.plot(dfa['timePeriod'], dfa['count'])
plt.plot(dfa.groupby(pd.Grouper(freq='1D', key='timePeriod')).sum()['count'])
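To put a number on that over-estimate, the hourly buckets can be re-aggregated by day and subtracted from the daily buckets; this is a sketch assuming both frames use the same midnight-aligned UTC timestamps.
# Re-aggregate the hourly buckets to days and line them up with the daily buckets
hourly_by_day = dfa.groupby(pd.Grouper(freq='1D', key='timePeriod')).sum()['count']
daily = dfa2.set_index('timePeriod')['count']
diff = (hourly_by_day - daily).dropna()
print(diff.describe())  # positive values mean the hourly aggregate over-counts relative to the daily buckets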
The counts endpoint appears to quantize/obfuscate any counts between 1 and 5 to a value of 5. Interestingly, this only appears to be the case for georeferenced queries. (Shown further below in this notebook.)
plt.plot(dfa['timePeriod'], dfa['count'])
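A quick way to check the quantization claim is to tabulate the smallest hourly count values: if 1 through 4 never appear while 5 is common, that is consistent with small counts being rounded up to 5 (a sketch using the hourly frame already loaded above).
# Distribution of the smallest hourly count values; gaps at 1-4 with a spike at 5 suggest quantization
print(dfa['count'].value_counts().sort_index().head(10))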
# TWINT drives its scraping with asyncio/aiohttp; nest_asyncio lets its event loop run inside the notebook's own loop
import asyncio
import aiohttp
import nest_asyncio
nest_asyncio.apply()
import twint
# Search query with geocode. This is the same query used to test the Premium Search API above, but in TWINT's geocode syntax (lat,long,radius rather than point_radius:[long lat radius])
twintq = '(urgent OR help) AND geocode:28.637459,77.232410,40km'
c = twint.Config()
c.Search = twintq
c.Store_csv = True
c.Output = "/Users/loganw/test.csv"
c.Hide_output = True
c.Since = "2021-04-04"
twint.run.Search(c)
[!] No more data! Scraping will stop now. found 0 deleted tweets in this search.
# read TWINT CSV as a Pandas DataFrame
dft = pd.read_csv('/Users/loganw/test.csv', parse_dates=['created_at'])
DtypeWarning: Columns (9) have mixed types. Specify dtype option on import or set low_memory=False.
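The warning is harmless here, but it can be avoided by letting pandas infer one dtype per column from the whole file; a sketch (which exact column triggers the warning depends on the scraped data):
# Re-read with low_memory=False so pandas infers dtypes from the entire file in one pass
dft = pd.read_csv('/Users/loganw/test.csv', parse_dates=['created_at'], low_memory=False)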
# group the TWINT tweets by hour
grp = dft.groupby(pd.Grouper(freq='1H', key='created_at'))
plt.figure(figsize=(12,4))
plt.plot(grp.count()['id'])
plt.ylabel('Tweets per hour')
plt.figure(figsize=(12,6))
plt.plot(dfa['timePeriod'], dfa['count'])
plt.plot(grp.count()['id'])
plt.legend(['Premium Search API counts', 'TWINT tweets'])
It appears that TWINT returns many more tweets over the most recent 7 days than are counted by the Premium Search API. Beyond that window, however, the two sources agree closely, apart from a few buckets where the Premium Search API counts are higher, which could be a consequence of tweets deleted after being counted.
plt.figure(figsize=(15,6))
plt.plot(dfa['timePeriod'], dfa['count'])
plt.plot(grp.count()['id'])
plt.ylim([0,30])
plt.legend(['Premium Search API counts', 'TWINT tweets'])
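To quantify where the two sources diverge, the Premium Search hourly counts can be aligned with the TWINT per-hour counts and differenced over time; this is a sketch, assuming both series use the same UTC hour buckets.
# Align the two hourly series on their timestamps and compare
premium = dfa.set_index('timePeriod')['count']
scraped = grp.count()['id']
comparison = pd.DataFrame({'premium': premium, 'twint': scraped}).dropna()
comparison['difference'] = comparison['twint'] - comparison['premium']
print(comparison['difference'].describe())  # large positive values should cluster in the most recent 7 days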