import folium
import h3
from shapely.geometry import Polygon, shape
import pandas as pd
import shapely
import numpy as np
import asyncio
import aiohttp
import nest_asyncio
nest_asyncio.apply()
import twint
import os
import time
import matplotlib.pyplot as plt  # needed for the histograms and error-bar plots below
# A GeoJSON object representing a bounding box for the region of interest
geojson = {
"type": "FeatureCollection",
"features": [
{
"type": "Feature",
"properties": {},
"geometry": {
"type": "Polygon",
"coordinates": [
[
[
68.02734375,
23.725011735951796
],
[
70.048828125,
20.427012814257385
],
[
72.333984375,
20.879342971957897
],
[
72.6416015625,
18.979025953255267
],
[
73.36669921875,
16.024695711685315
],
[
74.55322265625,
12.833226023521243
],
[
76.1572265625,
9.167178732976677
],
[
77.32177734375,
7.863381805309173
],
[
78.31054687499999,
8.47237228290914
],
[
78.44238281249999,
8.841651120809145
],
[
79.03564453124999,
9.188870084473406
],
[
79.62890625,
8.86336203355168
],
[
79.69482421875,
7.36246686553575
],
[
79.73876953125,
6.206090498573885
],
[
80.15625,
5.68158368342113
],
[
81.73828125,
6.293458760393985
],
[
82.08984375,
7.536764322084078
],
[
81.123046875,
9.167178732976677
],
[
80.13427734374999,
10.120301632173907
],
[
79.98046875,
11.673755403433413
],
[
80.44189453125,
13.389619591747595
],
[
80.26611328125,
15.284185114076433
],
[
81.10107421874999,
15.749962572748768
],
[
82.3974609375,
16.615137799987075
],
[
84.92431640625,
19.103648251663646
],
[
86.59423828125,
20.05593126519445
],
[
87.14355468749999,
20.73556590521864
],
[
87.099609375,
21.3303150734318
],
[
91.34033203125,
21.94304553343818
],
[
92.3291015625,
21.963424936844223
],
[
92.92236328125,
21.94304553343818
],
[
94.50439453125,
24.367113562651262
],
[
95.29541015625,
26.194876675795218
],
[
97.3828125,
27.059125784374068
],
[
97.6025390625,
28.304380682962783
],
[
96.30615234375,
29.49698759653577
],
[
94.5703125,
29.439597566602902
],
[
91.845703125,
28.110748760633534
],
[
89.69238281249999,
28.420391085674304
],
[
87.16552734375,
28.05259082333983
],
[
85.20996093749999,
28.69058765425071
],
[
83.671875,
29.592565403314087
],
[
82.2216796875,
30.41078179084589
],
[
80.88134765625,
31.015278981711266
],
[
79.5849609375,
32.52828936482526
],
[
78.15673828125,
35.37113502280101
],
[
75.30029296875,
37.00255267215955
],
[
72.3779296875,
36.61552763134925
],
[
71.25732421875,
34.50655662164561
],
[
68.9501953125,
31.297327991404266
],
[
65.93994140625,
27.391278222579277
],
[
65.72021484375,
25.46311452925943
],
[
68.02734375,
23.725011735951796
]
]
]
}
}
]
}
# Add a 1 degree buffer around this area
s = shape(geojson['features'][0]['geometry'])
s = s.buffer(1)
feature = {'type': 'Feature', 'properties': {}, 'geometry': shapely.geometry.mapping(s)}
# convert the exterior ring's coordinate tuples to lists
feature['geometry']['coordinates'] = [[[v[0], v[1]] for v in feature['geometry']['coordinates'][0]]]
feature = feature['geometry']
# swap (lng, lat) to (lat, lng), since h3.polyfill expects lat/lng order by default
feature['coordinates'][0] = [[v[1], v[0]] for v in feature['coordinates'][0]]
# map H3 hexagons (code taken from H3 example: https://github.com/uber/h3-py-notebooks/blob/master/notebooks/usage.ipynb)
def visualize_hexagons(hexagons, color="red", folium_map=None):
    """
    hexagons is a list of hexclusters. Each hexcluster is a list of hexagons.
    eg. [[hex1, hex2], [hex3, hex4]]
    """
    polylines = []
    lat = []
    lng = []
    for hex in hexagons:
        polygons = h3.h3_set_to_multi_polygon([hex], geo_json=False)
        # flatten polygons into loops.
        outlines = [loop for polygon in polygons for loop in polygon]
        polyline = [outline + [outline[0]] for outline in outlines][0]
        lat.extend(map(lambda v: v[0], polyline))
        lng.extend(map(lambda v: v[1], polyline))
        polylines.append(polyline)
    if folium_map is None:
        m = folium.Map(location=[sum(lat)/len(lat), sum(lng)/len(lng)], zoom_start=13, tiles='cartodbpositron')
    else:
        m = folium_map
    for polyline in polylines:
        my_PolyLine = folium.PolyLine(locations=polyline, weight=8, color=color)
        m.add_child(my_PolyLine)
    return m
def visualize_polygon(polyline, color):
    polyline.append(polyline[0])
    lat = [p[0] for p in polyline]
    lng = [p[1] for p in polyline]
    m = folium.Map(location=[sum(lat)/len(lat), sum(lng)/len(lng)], zoom_start=13, tiles='cartodbpositron')
    my_PolyLine = folium.PolyLine(locations=polyline, weight=8, color=color)
    m.add_child(my_PolyLine)
    return m
# find all hexagons with a center that falls within our buffered area of interest from above
polyline = feature['coordinates'][0]
polyline.append(polyline[0])
lat = [p[0] for p in polyline]
lng = [p[1] for p in polyline]
m = folium.Map(location=[sum(lat)/len(lat), sum(lng)/len(lng)], zoom_start=6, tiles='cartodbpositron')
my_PolyLine=folium.PolyLine(locations=polyline,weight=8,color="green")
m.add_child(my_PolyLine)
# make the list of hexagon IDs in our AOI
hexagons = list(h3.polyfill(feature, 3))
# map the hexagons
polylines = []
lat = []
lng = []
for hex in hexagons:
    polygons = h3.h3_set_to_multi_polygon([hex], geo_json=False)
    # flatten polygons into loops.
    outlines = [loop for polygon in polygons for loop in polygon]
    polyline = [outline + [outline[0]] for outline in outlines][0]
    lat.extend(map(lambda v: v[0], polyline))
    lng.extend(map(lambda v: v[1], polyline))
    polylines.append(polyline)
for polyline in polylines:
    my_PolyLine = folium.PolyLine(locations=polyline, weight=8, color='red')
    m.add_child(my_PolyLine)
display(m)
len(hexagons)
452
452 hexagons tile this area at this resolution. However, notice that some of them lie entirely over water, so not every cell will contain data.
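If a different balance between cell size and query count were needed, the H3 resolution could be changed; the quick check below (a sketch reusing the buffered feature polygon from above) shows how the cell count grows, since each resolution step subdivides a cell into roughly seven children.
# Sketch: count the cells polyfill returns at nearby resolutions.
# Each step up in resolution multiplies the number of geocoded searches by roughly 7.
for res in (2, 3, 4):
    print(res, len(h3.polyfill(feature, res)))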
np.sqrt((h3.cell_area(hexagons[0], unit='km^2')/0.827)/np.pi)
68.15279373567836
The ratio of the area of a regular hexagon to the area of its circumscribed circle is about 0.827. Dividing the area of each H3 cell by this factor therefore gives the area of a circle that approximately circumscribes the cell, and from that area the radius follows. (This is approximate because H3 cells are not regular hexagons, since they must tile a sphere; in fact, a few of them are pentagons. However, the approximation works for these purposes.)
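As an aside (not in the original analysis), 0.827 is simply the exact ratio 3√3/(2π), and its reciprocal is the 1.20919957616 factor used further below:
# A regular hexagon with circumradius R has area (3*sqrt(3)/2) * R^2, while its
# circumscribed circle has area pi * R^2, so the area ratio is 3*sqrt(3)/(2*pi).
print(3 * np.sqrt(3) / (2 * np.pi))   # 0.82699...
print(2 * np.pi / (3 * np.sqrt(3)))   # 1.20919957616...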
With this method, search queries can be automatically generated for each cell:
for hex in hexagons:
    center = h3.h3_to_geo(hex)
    r = np.sqrt((h3.cell_area(hex, unit='km^2')/0.827)/np.pi)
    query = "(urgent OR help) AND geocode:" + str(center[0]) + "," + str(center[1]) + "," + str(r) + "km"
print("Example query:", query)
Example query: (urgent OR help) AND geocode:22.089682313298564,81.20150766588854,69.85968148546738km
Next, these locations and radii can be used to download tweets using TWINT.
os.makedirs('data_by_hex', exist_ok=True)
# Download tweets using TWINT for each hexagon in the list
hexagons_left = list(hexagons)  # work on a copy so the original list is kept for the later steps
while len(hexagons_left) > 0:
    hex = hexagons_left.pop(0)
    if not os.path.exists('data_by_hex/' + hex + '.csv'):
        center = h3.h3_to_geo(hex)
        r = np.sqrt(h3.cell_area(hex, unit='km^2')*1.20919957616/np.pi)
        query = "(urgent OR help) AND geocode:" + str(center[0]) + "," + str(center[1]) + "," + str(r) + "km"
        print(query)
        c = twint.Config()
        c.Search = query
        c.Store_csv = True
        c.Output = "data_by_hex/" + hex + ".csv"
        c.Hide_output = True
        c.Since = "2021-01-01"
        try:
            twint.run.Search(c)
        except:
            # on failure, discard any partial output, wait, and retry this hexagon later
            if os.path.exists('data_by_hex/' + hex + '.csv'):
                os.remove('data_by_hex/' + hex + '.csv')
            time.sleep(10)
            print('error, removing this one')
            hexagons_left.append(hex)
    print(len(hexagons_left), "remaining")
Next, the tweets for each hexagon are grouped by day and counted. Two tables are constructed and saved to CSV: daily counts per hexagon, and per-hexagon metadata (center coordinates, radius, baseline tweet volume).
# To merge everything into a single DataFrame, it is helpful to start with the index we want to use ultimately.
# The easiest way to do this was just to use the same aggregation method that will eventually be applied to
# every search query in the dataset, and grab its index.
df = pd.read_csv('data_by_hex/' + '836181fffffffff' + '.csv', parse_dates=['created_at'])
grp = df.groupby(pd.Grouper(freq='1D', key='created_at'))
df = pd.DataFrame()
df['all'] = grp.count()['id']
del df['all']
tdf = df
# now "tdf" is an empty DataFrame that can be used to append columns to
# Next, this aggregation process is repeated for each file
# optionally, merge with a second set of files by specifying another directory here.
merge = False
for hex in hexagons:
    if os.path.exists('data_by_hex/' + hex + '.csv'):
        df = pd.read_csv('data_by_hex/' + hex + '.csv', parse_dates=['created_at'])
        if merge:
            df2 = pd.read_csv(merge + hex + '.csv', parse_dates=['created_at'])
            df = pd.concat([df, df2])  # df.append is deprecated in newer pandas
            df = df.drop_duplicates(subset=['id'])
            df = df.sort_values(by='created_at')
            df.index = range(len(df))
        # look for keywords and add as a column
        df['o2'] = df['tweet'].apply(lambda v: 'oxygen' in v.lower() or 'o2' in v.lower())
        try:
            grp = df.groupby(pd.Grouper(freq='1D', key='created_at'))
        except:
            # sometimes this groupby fails with a type issue
            df['created_at'] = pd.to_datetime(df['created_at'], utc=True)
            grp = df.groupby(pd.Grouper(freq='1D', key='created_at'))
        df = pd.DataFrame()
        df['all'] = grp.count()['id']
        df['o2'] = grp.sum()['o2']
        # store this data as columns in our main DataFrame, tdf
        tdf[(hex, 'all')] = df['all']
        tdf[(hex, 'o2')] = df['o2']
# replace NaNs (caused when no tweets were aggregated for a time range) with 0
tdf = tdf.fillna(0)
# make a list of information about each hexagon, including the baseline average tweet volume.
lats = []
lons = []
rs = []
baselines = []
for hex in hexagons:
    r = np.sqrt(h3.cell_area(hex, unit='km^2')*1.20919957616/np.pi)
    rs.append(r)
    loc = h3.h3_to_geo(hex)
    lats.append(loc[0])
    lons.append(loc[1])
    try:
        baseline = tdf[(hex, 'all')].mean()
    except:
        # no column exists for cells where no tweets were downloaded
        baseline = 0
    baselines.append(baseline)
hexes = pd.DataFrame()
hexes['lat'] = lats
hexes['lng'] = lons
hexes['r'] = rs
hexes['baseline'] = baselines
hexes.index = hexagons
hexes
| | lat | lng | r | baseline | weights |
|---|---|---|---|---|---|
| 8342e9fffffffff | 25.604755 | 72.536284 | 68.800914 | 2.200000 | 2.060976 |
| 8342f3fffffffff | 21.717278 | 68.237880 | 65.871140 | 0.000000 | 8.206186 |
| 833d8dfffffffff | 26.995700 | 81.311909 | 68.723951 | 30.906667 | 6.742437 |
| 833c08fffffffff | 28.506879 | 89.302672 | 69.306116 | 0.000000 | 8.206186 |
| 836092fffffffff | 21.992150 | 78.158324 | 68.380089 | 0.440000 | 5.526316 |
| ... | ... | ... | ... | ... | ... |
| 833c00fffffffff | 28.508304 | 86.005090 | 68.652271 | 0.000000 | 8.206186 |
| 8360b0fffffffff | 18.439594 | 79.364650 | 70.140780 | 17.426667 | 6.504505 |
| 8360aafffffffff | 16.895601 | 75.166881 | 68.040215 | 0.813333 | 5.636364 |
| 8342c3fffffffff | 22.112095 | 71.271054 | 66.291994 | 3.813333 | 6.210526 |
| 8360f2fffffffff | 16.791736 | 72.376812 | 66.151014 | 0.000000 | 8.206186 |

452 rows × 5 columns
# export both to CSV
tdf.to_csv('daily_weighted.csv')
hexes.to_csv('hexes_weighted.csv')
It is important to take a look at how many individual users are sampled in the dataset. If each hex cell contained tweets from only a handful of users, the data would be much less reliable than if it contained a wider sample.
# Check to see if there were a substantial number of individual Twitter users for each of the hex cells
users = {}
for hex in hexagons:
    if os.path.exists('data_by_hex/' + hex + '.csv'):
        df = pd.read_csv('data_by_hex/' + hex + '.csv', parse_dates=['created_at'])
        grp = df.groupby('username').count()
        users[hex] = len(grp)
# Plot the distribution of user counts.
plt.hist(list(users.values()), np.arange(0, 2000, 50))
[Histogram of unique users per hexagon (50-user bins from 0 to 2000): heavily skewed toward low counts, with 90 cells containing fewer than 50 users each.]
Some tweets are only included if the query is run within seven days of the tweet (those that match a geocoded search because of the user's profile location rather than a place tag). The following code estimates how many tweets might be missing from older data in the TWINT aggregates. Specifically, for each grid cell it computes the ratio of tweets matched via profile location or place to tweets matched via place alone, using data from two scraping sessions separated by a certain amount of time.
averages = []
stds = []
weighted = {}
for hex in hexagons:
    df = None
    if os.path.exists('path1/' + hex + '.csv') and os.path.exists('path2/' + hex + '.csv'):
        df2 = pd.read_csv('path1/' + hex + '.csv', parse_dates=['created_at'])
        df1 = pd.read_csv('path2/' + hex + '.csv', parse_dates=['created_at'])
        df1['created_at'] = pd.to_datetime(df1['created_at'], utc=True)
        df2['created_at'] = pd.to_datetime(df2['created_at'], utc=True)
        grp1 = df1.groupby(pd.Grouper(freq='1D', key='created_at'))
        grp2 = df2.groupby(pd.Grouper(freq='1D', key='created_at'))
        df = grp2.count()[['id']]
        df['old'] = grp1.count()['id']
        df = df.fillna(0)
        # compute the ratio of tweets captured by the "old" scraping session compared to those captured
        # by the "new" scraping session. For times far in the past, this will be 1, and for times in the
        # most recent 7 days, this will be 1, but in between, it will be larger than 1.
        ratio = np.array(df['old']) / np.array(df['id'])
        df['ratio'] = ratio
        # These indices would need to be adjusted for other data so that they contain the days when
        # df2 has a partial set of data, but df1 has a full set of data. For example, if df1 was
        # scraped on the 7th, and df2 was scraped on the 10th, this could include the 1st, 2nd, and 3rd.
        ratios = np.array(df.loc[df.index[-20:-13]]['ratio'])
        ratios = [r for r in ratios if r < 1000]
        avg_ratio = np.mean(ratios)
        std_ratio = np.std(ratios)
        weighted_ratio = df.loc[df.index[-20:-13]]['old'].sum() / df.loc[df.index[-20:-13]]['id'].sum()
        averages.append(avg_ratio)
        stds.append(std_ratio)
        weighted[hex] = weighted_ratio
weights = []
for h in hexes.index:
    if h in weighted and weighted[h] < 1000 and not np.isnan(weighted[h]):
        weights.append(weighted[h])
    else:
        weights.append(np.median([v for v in list(weighted.values()) if v < 1000 and not np.isnan(v)]))
# add this to our per-hex dataframe
hexes['weights'] = weights
np.median([v for v in list(weighted.values()) if v < 1000 and not np.isnan(v)])
8.206185567010309
plt.errorbar(range(222), [a for a in averages if a < 10000 and not np.isnan(a)], yerr=[a for a in stds if a < 10000 and not np.isnan(a)])
[Error-bar plot of the per-cell ratio (mean ± standard deviation), which varies widely from cell to cell.]
As can be seen above, this ratio is highly variable. While the highest-variance cells are likely to be those with very few tweets and users, which shouldn't be visualized anyway (and aren't in Bellingcat's visualization), this is still cause for concern. Consequently, tweet volume from more than seven days ago cannot be directly compared with tweet volume from the most recent seven days, even with this attempted compensation method. However, the relative frequency of specific terms within the dataset (for example, "oxygen") can be compared, though due to the smaller sample size, variance will be larger for older tweets.
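As a minimal sketch of that kind of comparison (not part of the original notebook, and assuming the tdf columns built above), the share of tweets in a cell that mention oxygen can be computed per day; this relative measure is meaningful even where absolute volumes are not directly comparable:
# Sketch: relative frequency of the "o2" keyword per day for a single cell.
# Both numerator and denominator come from the same scraping session, so the
# ratio is less affected by the 7-day geocode effect than raw counts are.
# '836181fffffffff' is the cell already used above as the index template.
hex = '836181fffffffff'
o2_share = tdf[(hex, 'o2')] / tdf[(hex, 'all')].replace(0, np.nan)
print(o2_share.tail())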