import json
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from datetime import datetime
from multiprocessing.pool import ThreadPool
from shapely.geometry import Polygon, MultiPolygon, shape
# Bin width, in days, used for the resampled event-count plots and their titles.
SAMPLE_FREQ = 3
# Default (width, height) for every matplotlib figure created below.
plt.rcParams['figure.figsize'] = [10, 5]
import seaborn as sns
# Switch matplotlib to seaborn's default theme for all subsequent plots.
sns.set_theme()
# Input files; only the spreadsheet is read here.
ACLED_SPREADSHEET = 'Ukraine_Black_Sea_2020_2023_Mar10.xlsx'
ACLED_CSV = 'Ukraine_Black_Sea_2020_2023_Mar10.csv'
acled = pd.read_excel(ACLED_SPREADSHEET)
# Index the events by date so they can be sliced and resampled over time.
acled['date'] = pd.to_datetime(acled.EVENT_DATE)
# Sort before slicing: a label slice on a DatetimeIndex is only reliable when
# the index is monotonic increasing, and the row order of the spreadsheet is
# not guaranteed.
acled = acled.set_index('date').sort_index()
# Everything from 2022-02-24 onwards is the period analysed below.
war = acled.loc['2022-02-24':]
#war.ACTOR1.value_counts().head(40)
This is not 100% accurate, but the stragglers are so few that they are not relevant. Only "Unidentified Military Forces" may be skewing the results a bit.
# Events whose primary actor name does not contain 'Ukraine'. na=False treats a
# missing ACTOR1 as "no match", so such rows are kept here instead of making the
# boolean negation fail on NaN.
not_ukraine = war[~war['ACTOR1'].str.contains('Ukraine', na=False)]
# Take an explicit copy: a helper column is added below, and assigning to a
# filtered slice is what triggers pandas' SettingWithCopyWarning.
strikes = not_ukraine[not_ukraine.SUB_EVENT_TYPE == 'Air/drone strike'].copy()
# Dummy var - it may be smarter to use .count() instead of summing dummy vars.
# Each event carries a 1, so summing a resampled window gives an event count.
strikes['dummy'] = 1
/tmp/ipykernel_11346/4090086438.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy strikes['dummy'] = 1
# Air/drone strikes per SAMPLE_FREQ-day bin, exported as SVG and PNG.
plt.figure()
plt.title(f'Air strikes, resampled to {SAMPLE_FREQ} days')
# The bin width is derived from SAMPLE_FREQ so the data matches the title
# (it used to be hard-coded to one day while the title announced SAMPLE_FREQ).
strikes['dummy'].resample(f'{SAMPLE_FREQ}D').sum().plot(xlabel='', ylabel='Number of air strikes')
plt.savefig('acled_air_strikes.svg', bbox_inches='tight', dpi=600)
plt.savefig('acled_air_strikes.png', bbox_inches='tight', dpi=600)
# Monthly number of strikes as a bar chart. ISO is only a vehicle for counting
# rows; the original note says any column gives the same totals.
monthly_strikes = strikes['ISO'].resample('M').count()
monthly_strikes.plot.bar()
<AxesSubplot: xlabel='date'>
# Air/drone strikes whose free-text notes mention a missile or rocket.
# na=False: rows without notes count as "no match" instead of poisoning the mask.
# .copy(): a column is assigned below; writing to a filtered slice is what
# raises pandas' SettingWithCopyWarning.
# NOTE(review): the pattern is case-sensitive, so a capitalised "Missile" or
# "Rocket" is not matched - confirm that this is intended.
missile_strikes = strikes[strikes['NOTES'].str.contains('missile|rocket', na=False)].copy()
# Each event carries a 1, so summing a resampled window gives an event count.
missile_strikes['dummy'] = 1
/tmp/ipykernel_11346/3808476491.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy missile_strikes['dummy'] = 1
# Quick look at missile strikes summed per SAMPLE_FREQ-day bin.
bin_width = f'{SAMPLE_FREQ}d'
missile_strikes['dummy'].resample(bin_width).sum().plot()
<AxesSubplot: xlabel='date'>
# Export the per-event marker column (one row per missile strike, indexed by date).
missile_strikes['dummy'].to_csv('acled_missiles_dummy.csv')
# Missile strikes per SAMPLE_FREQ-day bin, exported as SVG and PNG.
plt.figure()
plt.title(f'Missile strikes, resampled to {SAMPLE_FREQ} days')
# The bin width follows SAMPLE_FREQ instead of a hard-coded '3d', so the title
# and the data cannot drift apart when the constant is changed.
missile_strikes['dummy'].resample(f'{SAMPLE_FREQ}D').sum().plot(xlabel='', ylabel='Number of missile strikes')
plt.savefig('acled_missile_strikes.svg', bbox_inches='tight', dpi=600)
plt.savefig('acled_missile_strikes.png', bbox_inches='tight', dpi=600)
# Side-by-side counts of missile strikes and all air strikes per SAMPLE_FREQ-day bin.
#
# Both series are first reduced to daily counts, which always share the same
# midnight-aligned grid, and only then binned together. Resampling each series
# separately with a multi-day width anchors the bins at each series' own first
# day, so the two columns could end up on different grids and fail to line up.
# Days on which only one of the two series has events are filled with 0 rather
# than left as NaN, because "no event recorded" is a count of zero.
daily_comparison = pd.concat(
    [missile_strikes['dummy'].resample('D').sum(), strikes['dummy'].resample('D').sum()],
    axis=1,
).fillna(0)
daily_comparison.columns = ['missile_strikes', 'strikes']
strikes_comparison = daily_comparison.resample(f'{SAMPLE_FREQ}D').sum()
strikes_comparison = strikes_comparison.sort_index()
# Draw both columns of strikes_comparison on one set of axes and export the figure.
plt.figure()
plt.title(f'Total air strikes vs missile strikes [resampled to {SAMPLE_FREQ} days]')
ax = strikes_comparison['missile_strikes'].plot()
strikes_comparison['strikes'].plot(ax=ax)
# Legend entries follow the order in which the two lines were drawn.
legend_labels = ['Missiles', 'Strikes']
ax.legend(legend_labels)
for outfile in ('acled_air_vs_missile_strikes.svg', 'acled_air_vs_missile_strikes.png'):
    plt.savefig(outfile, bbox_inches='tight', dpi=600)
# Correlation between the two binned series, read from the correlation matrix by label.
strikes_comparison.corr().loc['missile_strikes', 'strikes']
0.1788722721999166
# strikes.SOURCE.value_counts().head(30)
Also count missile events that are coded as part of regular battles, since missiles are not exclusively coded as "Air" events.
# Every non-Ukrainian event whose notes mention a missile or rocket, whatever
# its event type. na=False makes rows without notes count as "no match" instead
# of putting NaN into the boolean mask.
missiles = not_ukraine[not_ukraine['NOTES'].str.contains('missile|rocket', na=False)]
missiles = missiles.sort_index()
# Overlay daily, weekly and monthly event counts on the same axes.
# count() tallies non-null ISO values per bin (assumes ISO is set on every row).
# Upper-case aliases are spelled out because a lower-case 'm' is easily read as
# minutes; month-end is what is meant. NOTE: pandas >= 2.2 prefers 'ME' here.
for rule in ('1D', '1W', '1M'):
    missiles['ISO'].resample(rule).count().plot()
plt.savefig('acled_missiles_day-week-month.svg', bbox_inches='tight', dpi=600)
plt.savefig('acled_missiles_day-week-month.png', bbox_inches='tight', dpi=600)
# missiles.NOTES.values[:100]
# Number of missile events per calendar day.
missile_counts = missiles['ISO'].resample('1d').count()
# Trailing 7-day moving average; the first six days have no full window and stay NaN.
weekly_average = missile_counts.rolling(window=7).mean()
weekly_average.plot()
<AxesSubplot: xlabel='date'>
# Working frame for the rest of the analysis: one row per day, with the number
# of missile events in a single 'count' column.
daily_counts = missiles['ISO'].resample('1d').count()
analysis = daily_counts.to_frame(name='count')
# Centred 3-day moving average; min_periods=1 keeps the first and last day defined.
analysis.rolling(window=3, min_periods=1, center=True).mean()
count | |
---|---|
date | |
2022-02-24 | 19.500000 |
2022-02-25 | 15.000000 |
2022-02-26 | 7.666667 |
2022-02-27 | 9.333333 |
2022-02-28 | 11.333333 |
... | ... |
2023-03-05 | 0.333333 |
2023-03-06 | 0.333333 |
2023-03-07 | 1.000000 |
2023-03-08 | 4.000000 |
2023-03-09 | 5.500000 |
379 rows × 1 columns
# analysis[analysis['count'] != 0].index
# analysis[analysis['count'] != 0].rolling(3, min_periods=2, center=True).apply(distance)
def dataframe_roll(df):
    """Build a callback for ``rolling(...).apply(..., raw=False)``.

    The returned function receives one rolling window as a Series and returns
    the number of whole days between the window's first and last index labels.

    Args:
        df: The frame being rolled over. It is not used by the current
            callback; it is accepted so that the callback could look up other
            columns of the same window through the closure.

    Returns:
        A function mapping a datetime-indexed window Series to an ``int``
        number of days.
    """
    def window_span_days(window_series):
        # The window needs its index, hence raw=False at the call site.
        return (window_series.index[-1] - window_series.index[0]).days

    return window_span_days
# Restrict to days with at least one event, then measure for each of them the
# span in days covered by a centred window of three such rows (two at the edges).
active_days = analysis.loc[analysis['count'] != 0]
window_spans = active_days.rolling(3, min_periods=2, center=True).apply(
    dataframe_roll(analysis),
    raw=False,
)
# Average the spans over 7-day bins and plot the result.
window_spans.resample('7d').mean().plot()
<AxesSubplot: xlabel='date'>
# Bubble chart of daily missile events: height, marker area and colour all
# encode the same daily count.
length = missile_counts.index.shape[0]
ax = plt.gca()
ax.set_ylim([0, 45])
bubble_area = missile_counts.to_numpy() * 100
plt.scatter(
    missile_counts.index,
    missile_counts,
    s=bubble_area,
    alpha=0.2,
    # Negated so that larger counts map to the low end of the colour map.
    c=-bubble_area,
)
for outfile in ('acled_missiles_scatter_2d.svg', 'acled_missiles_scatter_2d.png'):
    plt.savefig(outfile, bbox_inches='tight', dpi=600)
# Same bubbles on a single baseline (y = 0): only marker area and colour encode
# the daily count. `length` is the number of days, computed before this cell.
bubble_area = missile_counts.to_numpy() * 100
baseline = np.zeros(length)
plt.scatter(
    missile_counts.index,
    baseline,
    s=bubble_area,
    alpha=0.2,
    c=-bubble_area,
)
for outfile in ('acled_missiles_scatter_1d.svg', 'acled_missiles_scatter_1d.png'):
    plt.savefig(outfile, bbox_inches='tight', dpi=600)
# Define barrages as having at least BARRAGE_CUTOFF strikes a day.
BARRAGE_CUTOFF = 10
# Keep the daily count on barrage days and NaN on every other day.
# The comparison is inclusive so that a day with exactly BARRAGE_CUTOFF events
# qualifies, as the definition above states (the earlier strict `>` excluded it).
analysis['barrages'] = analysis['count'].where(analysis['count'] >= BARRAGE_CUTOFF)
analysis
count | barrages | |
---|---|---|
date | ||
2022-02-24 | 30 | 30.0 |
2022-02-25 | 9 | NaN |
2022-02-26 | 6 | NaN |
2022-02-27 | 8 | NaN |
2022-02-28 | 14 | 14.0 |
... | ... | ... |
2023-03-05 | 0 | NaN |
2023-03-06 | 0 | NaN |
2023-03-07 | 1 | NaN |
2023-03-08 | 2 | NaN |
2023-03-09 | 9 | NaN |
379 rows × 2 columns
# Barrage days as bubbles on a single baseline (y = 0); marker area and colour
# encode the barrage size. Days whose 'barrages' value is NaN get NaN size and colour.
barrage_sizes = analysis['barrages']
length = barrage_sizes.index.shape[0]
bubble_area = barrage_sizes.to_numpy() * 100
plt.scatter(
    barrage_sizes.index,
    np.zeros(length),
    s=bubble_area,
    alpha=0.3,
    c=-bubble_area,
)
for outfile in ('acled_missiles_barrages_scatter_1d.svg', 'acled_missiles_barrages_scatter_1d.png'):
    plt.savefig(outfile, bbox_inches='tight', dpi=600)
# Barrage days as bubbles whose height, marker area and colour all encode the
# barrage size; the y-axis is fixed to 0..50.
barrage_sizes = analysis['barrages']
length = barrage_sizes.index.shape[0]
ax = plt.gca()
ax.set_ylim([0, 50])
bubble_area = barrage_sizes.to_numpy() * 100
plt.scatter(
    barrage_sizes.index,
    barrage_sizes,
    s=bubble_area,
    alpha=0.4,
    c=-bubble_area,
)
for outfile in ('acled_missiles_barrages_scatter_2d.svg', 'acled_missiles_barrages_scatter_2d.png'):
    plt.savefig(outfile, bbox_inches='tight', dpi=600)
# Seaborn version of the barrage scatter: the 'barrages' column drives the
# y position, the hue and the marker size at once.
barrage_encodings = dict.fromkeys(('y', 'hue', 'size'), 'barrages')
sns.relplot(data=analysis, x='date', **barrage_encodings)
<seaborn.axisgrid.FacetGrid at 0x7f1da9ae6830>
from pandas.plotting import autocorrelation_plot
# Autocorrelation of the daily missile event counts, drawn on a fresh figure.
plt.figure()
daily_event_counts = analysis['count']
autocorrelation_plot(daily_event_counts)
for outfile in ('acled_missiles_autocorrelation.svg', 'acled_missiles_autocorrelation.png'):
    plt.savefig(outfile, bbox_inches='tight', dpi=600)
from pandas.plotting import bootstrap_plot
# Bootstrap plot of the daily counts: 500 random samples of 50 days each.
# The returned figure is bound to a name so that it is not echoed as cell output.
bootstrap_figure = bootstrap_plot(series=analysis['count'], size=50, samples=500, color="grey")
# Daily counts as a line plot, with the error bar set to the standard deviation.
sns.relplot(data=analysis, x='date', y='count', kind="line", errorbar='sd')
<seaborn.axisgrid.FacetGrid at 0x7f1da9ae7b20>
# seaborn's regression plots need a numeric x, so store each day as whole
# seconds since 1970-01-01 (the naive index is treated as UTC).
analysis['date'] = (analysis.index - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
sns.lmplot(data=analysis, x='date', y='count', lowess=True, line_kws={"color": "C1"})
# XXX This is a hack and discouraged by seaborn authors
# https://stackoverflow.com/questions/40558128/using-datetimes-with-seaborns-regplot
ax = plt.gca()
xlim = ax.get_xlim()
xticks = ax.get_xticks()
# Pin the tick positions before labelling them: labels without fixed positions
# trigger matplotlib's FixedFormatter/FixedLocator warning and can end up on
# the wrong ticks. Pinning may widen the view, so the limits are restored.
ax.set_xticks(xticks)
ax.set_xlim(xlim)
ax.tick_params(axis='x', labelrotation=45)
# Decode the epoch seconds as UTC, the same convention used to encode them;
# datetime.fromtimestamp() would apply the local time zone and could shift
# labels by a day.
xticks_dates = list(pd.to_datetime(xticks, unit='s').strftime('%Y-%m-%d'))
ax.set_xticklabels(xticks_dates)
plt.savefig('acled_missiles_reg_lowess.svg', bbox_inches='tight', dpi=600)
plt.savefig('acled_missiles_reg_lowess.png', bbox_inches='tight', dpi=600)
/tmp/ipykernel_11346/3192376083.py:10: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_xticklabels(xticks_dates)
# Residuals of a linear fit of the daily counts against time. The 'date' column
# is expected to hold epoch seconds at this point.
sns.residplot(data=analysis, x='date', y='count', line_kws={"color": "C1"})
ax = plt.gca()
xlim = ax.get_xlim()
xticks = ax.get_xticks()
# Pin the tick positions before labelling them: labels without fixed positions
# trigger matplotlib's FixedFormatter/FixedLocator warning and can end up on
# the wrong ticks. Pinning may widen the view, so the limits are restored.
ax.set_xticks(xticks)
ax.set_xlim(xlim)
ax.tick_params(axis='x', labelrotation=45)
# Decode the epoch seconds as UTC rather than local time, so that the labels
# do not depend on the time zone of the machine running the notebook.
xticks_dates = list(pd.to_datetime(xticks, unit='s').strftime('%Y-%m-%d'))
ax.set_xticklabels(xticks_dates)
/tmp/ipykernel_11346/219493459.py:7: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_xticklabels(xticks_dates)
[Text(1640000000.0, 0, '2021-12-20'), Text(1645000000.0, 0, '2022-02-16'), Text(1650000000.0, 0, '2022-04-15'), Text(1655000000.0, 0, '2022-06-12'), Text(1660000000.0, 0, '2022-08-09'), Text(1665000000.0, 0, '2022-10-05'), Text(1670000000.0, 0, '2022-12-02'), Text(1675000000.0, 0, '2023-01-29'), Text(1680000000.0, 0, '2023-03-28')]
# Running total of missile events since the first day of the series.
running_total = analysis['count'].cumsum()
analysis.loc[:, 'cumulative'] = running_total
analysis
count | barrages | date | cumulative | |
---|---|---|---|---|
date | ||||
2022-02-24 | 30 | 30.0 | 1645660800 | 30 |
2022-02-25 | 9 | NaN | 1645747200 | 39 |
2022-02-26 | 6 | NaN | 1645833600 | 45 |
2022-02-27 | 8 | NaN | 1645920000 | 53 |
2022-02-28 | 14 | 14.0 | 1646006400 | 67 |
... | ... | ... | ... | ... |
2023-03-05 | 0 | NaN | 1677974400 | 1612 |
2023-03-06 | 0 | NaN | 1678060800 | 1612 |
2023-03-07 | 1 | NaN | 1678147200 | 1613 |
2023-03-08 | 2 | NaN | 1678233600 | 1615 |
2023-03-09 | 9 | NaN | 1678320000 | 1624 |
379 rows × 4 columns
# Cumulative number of missile events over time, exported as SVG and PNG.
plt.figure()
cumulative_events = analysis['cumulative']
cumulative_events.plot()
for outfile in ('acled_missiles_cumulative.svg', 'acled_missiles_cumulative.png'):
    plt.savefig(outfile, bbox_inches='tight', dpi=600)
# Linear regression of the cumulative event count against time. The 'date'
# column is expected to hold epoch seconds at this point.
sns.lmplot(data=analysis, x='date', y='cumulative', line_kws={"color": "C1"})
ax = plt.gca()
xlim = ax.get_xlim()
xticks = ax.get_xticks()
# Pin the tick positions before labelling them: labels without fixed positions
# trigger matplotlib's FixedFormatter/FixedLocator warning and can end up on
# the wrong ticks. Pinning may widen the view, so the limits are restored.
ax.set_xticks(xticks)
ax.set_xlim(xlim)
ax.tick_params(axis='x', labelrotation=45)
# Decode the epoch seconds as UTC rather than local time, so that the labels
# do not depend on the time zone of the machine running the notebook.
xticks_dates = list(pd.to_datetime(xticks, unit='s').strftime('%Y-%m-%d'))
ax.set_xticklabels(xticks_dates)
plt.savefig('acled_missiles_reg_cumulative.svg', bbox_inches='tight', dpi=600)
plt.savefig('acled_missiles_reg_cumulative.png', bbox_inches='tight', dpi=600)
/tmp/ipykernel_11346/2189466815.py:7: UserWarning: FixedFormatter should only be used together with FixedLocator ax.set_xticklabels(xticks_dates)
# Overlay all missile events on the air strike counts and export the figure.
plt.figure()
# Title reworded (it read "vs missile all missile events"); it has no
# placeholders, so it is a plain string rather than an f-string.
plt.title('Total air strikes vs all missile events')
# NOTE(review): analysis['count'] holds daily counts, while
# strikes_comparison['strikes'] is summed over multi-day bins, so the two lines
# are not on the same scale - confirm whether both should share one bin width.
ax = analysis['count'].plot()
strikes_comparison['strikes'].plot(ax=ax)
ax.legend(['Missile events', 'Air strikes'])
plt.savefig('acled_air_strikes_vs_missile_events.svg', bbox_inches='tight', dpi=600)
plt.savefig('acled_air_strikes_vs_missile_events.png', bbox_inches='tight', dpi=600)