import re
from urllib import request
from bs4 import BeautifulSoup
from snscrape.modules.twitter import TwitterTweetScraper, TwitterTweetScraperMode
# --- Download the Oryx article and collect the links to evaluate ---------
USER_AGENT = 'oryx-scraper/0.0.1'
TIMEOUT = 5  # passed to urlopen as its timeout (seconds)
ORYX_URL = 'https://www.oryxspioenkop.com/2022/11/tracking-threat-russias-izdeliye-305.html'

headers = {
    'User-Agent': USER_AGENT,
    'Accept-Language': 'en-US,en',
}
req = request.Request(
    url=ORYX_URL,
    headers=headers,
    method='GET',
)
# Use the response as a context manager so the connection is closed as soon
# as the body has been read, instead of being left to garbage collection.
with request.urlopen(req, timeout=TIMEOUT) as response:
    res = response.read()

soup = BeautifulSoup(res, 'html.parser')
post_container = soup.find('div', {'class': 'post'})
post = post_container.find('article') if post_container is not None else None
if post is None:
    # Fail with a readable message rather than an AttributeError on None
    # when the page no longer has the expected <div class="post"><article>.
    raise RuntimeError(
        f'could not locate the article body in {ORYX_URL}; '
        'the page layout may have changed'
    )

# Keep only anchors whose text contains a parenthesised part.
# Raw string: '\(' is not a valid escape in a normal string literal.
all_links = post.find_all('a', string=re.compile(r'\(.*\)'))
# all_links[7].__dict__
# all_links[7].text
def extract_twitter_id(url):
    """Return the tweet ID contained in a tweet URL.

    The ID is the run of digits that directly follows ``/status/`` (or
    ``/statuses/``). Anything after it -- a query string such as ``?s=20``,
    a trailing slash, or a media suffix such as ``/photo/1`` -- is ignored.

    If *url* has no such segment (for example, it already is a bare ID),
    the last ``/``-separated component is returned unchanged.
    """
    match = re.search(r'/status(?:es)?/(\d+)', url)
    if match:
        return match.group(1)
    # No recognisable status segment: fall back to the last path component.
    return url.split('://')[-1].split('/')[-1]
# Scraper mode used for every lookup: fetch the single referenced tweet.
mode = TwitterTweetScraperMode.SINGLE


def extract_date(link):
    """Return the ``date`` attribute of the tweet that *link* points to.

    The tweet ID is parsed from *link* with ``extract_twitter_id`` and the
    first item produced by ``TwitterTweetScraper`` for that ID is used.
    This performs a network request per call.

    Raises:
        LookupError: if the scraper yields no item for the tweet
            (presumably a deleted or inaccessible tweet).
    """
    tweet_id = extract_twitter_id(link)
    scraper = TwitterTweetScraper(tweet_id, mode=mode)
    # Use a default with next(): this function is called from inside the
    # process_links generator, where an escaping StopIteration would be
    # turned into an opaque "generator raised StopIteration" RuntimeError.
    tweet = next(scraper.get_items(), None)
    if tweet is None:
        raise LookupError(f'no tweet found for {link!r} (id {tweet_id!r})')
    return tweet.date
# One match per run of digits in a link's text; compiled once, outside the
# loop. Raw string: '\d' is not a valid escape in a normal string literal.
_NUMBER_RE = re.compile(r'(\d+)[, ]?')


def process_links(elements):
    """Yield one ``[date, link]`` row per number found in each link's text.

    For every element, the runs of digits in ``elm.text`` are counted and
    that many rows are yielded, each holding the tweet date (looked up via
    ``extract_date``) and the element's ``href``.

    Elements whose text contains no digits yield nothing and are skipped
    before the tweet lookup, which saves a network request.
    """
    for elm in elements:
        amount = len(_NUMBER_RE.findall(elm.text))
        if not amount:
            continue
        link = elm.get('href')
        date = extract_date(link)
        for _ in range(amount):
            # A fresh list per row so callers can mutate rows independently.
            yield [date, link]
# This is a bit slow since it does dozens of network calls
dates = list(process_links(all_links))
# dates
import json

# Write through a context manager so the file is flushed and closed even if
# serialisation fails part-way. default=str stringifies any value json
# cannot encode natively (here, the date objects).
with open('lmur.json', 'w') as json_file:
    json.dump(dates, json_file, indent=4, default=str)
import pandas as pd

# One row per counted item: the tweet date and the URL it came from.
# Naming the columns in the constructor keeps 'date' and 'url' present even
# when the scrape returned no rows (rename() on an empty frame would leave
# it without columns and the next line would raise KeyError).
df = pd.DataFrame(dates, columns=['date', 'url'])
# Guarantee a datetime dtype so the .dt accessor below also works on an
# empty frame; a no-op when the column is already datetime-typed.
df['date'] = pd.to_datetime(df['date'])
# Subtract 3 days to account for delayed publishing of footage
df['date'] = df['date'] - pd.Timedelta(days=3)
df = df.assign(amount=1)
# Totals per month number (year is not part of the key).
monthly = df.groupby(df['date'].dt.month)['amount'].sum()
monthly
# Notebook output (value of `monthly`):
# date
# 6      9
# 7     21
# 8      7
# 9      4
# 10     9
# Name: amount, dtype: int64
# Bucket every row into its calendar month (year-month period).
# Periods cannot carry a timezone, and pandas warns when to_period() has to
# drop one implicitly, so drop it explicitly first. tz_localize(None) keeps
# the same wall-clock values and leaves tz-naive data unchanged.
df['month'] = (
    pd.to_datetime(df['date'])
    .dt.tz_localize(None)
    .dt.to_period('M')
)
# Notebook output recorded for the statement above:
#   /tmp/ipykernel_36205/2853671891.py:1: UserWarning: Converting to
#   PeriodArray/Index representation will drop timezone information.
#     df['month'] = pd.to_datetime(df['date']).dt.to_period('M')
# Number of rows per 'month' period, drawn as a line chart.
per_month = df.groupby('month')['amount'].count()
plot = per_month.plot(
    kind='line',
    title='LMUR usage in 2022',
    xlabel='month',
    ylabel='units fired',
)

# Save through the Figure that owns the axes returned by .plot().
fig = plot.get_figure()
fig.savefig("lmur.png")
import matplotlib.pyplot as plt

# Same chart of the `monthly` totals, drawn on the current axes through the
# Axes object rather than the pyplot state-machine wrappers.
ax = plt.gca()
ax.set_xlabel('month')
ax.set_ylabel('units fired')
ax.plot(monthly, label='LMUR missiles in 2022')
ax.legend()
# Notebook output: <matplotlib.legend.Legend at 0x7f0dec65b5b0>
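# NOTE: calling plt.savefig at this point appears to save a blank (white)
# image -- presumably the chart had already been rendered, so pyplot saves a
# new, empty figure. Saving via fig.savefig (as done for lmur.png above) works.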
# plt.savefig('lmur.png')
# Notebook output: <Figure size 640x480 with 0 Axes>