This repository has been archived by the owner on Feb 20, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
insinc.py
219 lines (190 loc) · 10.3 KB
/
insinc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import os
import re
from collections import OrderedDict
from datetime import date, timedelta, datetime
from itertools import groupby, chain
import pendulum
import yaml
from bs4 import BeautifulSoup
from common import VideoProvider, VideoMetadata, TimeCode, adjust_timecode, timecode_to_seconds, PreparedVideoInfo, \
is_root_clip, group_root_and_subclips, shift_timecodes
from ffmpeg import download_mms, clip_video
class InsIncVideoClip(object):
    """One clip entry scraped from the provider's clip search results.

    The constructor stores its arguments unchanged on the instance:

    * ``category`` -- heading text the clip was listed under.
    * ``title`` -- link text of the clip.
    * ``mms_url`` -- ``mms://`` URL of the video the clip belongs to.
    * ``for_date`` -- date of the clip; must support ``%Y-%m-%d`` formatting.
    * ``start_time`` / ``end_time`` -- clip boundaries within the video, as
      timecode strings.
    """

    def __init__(self, category, title, mms_url, for_date, start_time, end_time):
        self.category = category
        self.title = title
        self.mms_url = mms_url
        self.for_date = for_date
        self.start_time = start_time
        self.end_time = end_time

    def __str__(self):
        """Return a short human-readable summary of the clip."""
        return "{}: '{} ({:%Y-%m-%d} at {})'".format(self.category, self.title, self.for_date, self.start_time)

    def __repr__(self):
        """Return an unambiguous representation listing every attribute."""
        return (
            "{}(category={!r}, title={!r}, mms_url={!r}, "
            "for_date={!r}, start_time={!r}, end_time={!r})"
        ).format(
            type(self).__name__,
            self.category,
            self.title,
            self.mms_url,
            self.for_date,
            self.start_time,
            self.end_time,
        )
def timestamp_to_timedelta(val):
    """Convert an ``H:M:S`` timestamp string into a ``timedelta``.

    Fields are split on ``:`` rather than sliced at fixed offsets, so
    timestamps whose fields are not zero-padded to two digits (``1:02:03``)
    or whose hour field is wider than two digits are parsed correctly.

    :param val: timestamp string with exactly three colon-separated integers.
    :return: the equivalent :class:`datetime.timedelta`.
    :raises ValueError: if ``val`` does not have three integer fields.
    """
    parts = val.strip().split(':')
    if len(parts) != 3:
        raise ValueError("Expected a timestamp of the form H:M:S, got {!r}".format(val))
    hours, minutes, seconds = (int(part) for part in parts)
    return timedelta(hours=hours, minutes=minutes, seconds=seconds)
class InsIncScraperApi(VideoProvider):
    """Video provider that scrapes a meeting-search site for ``mms://`` clips.

    The methods below read ``self.provider_url`` and ``self.session``; neither
    is assigned in this class, so they are presumably set up by
    ``VideoProvider.__init__`` -- confirm against ``common.VideoProvider``.
    """

    def __init__(self, search_url, tz='America/Vancouver'):
        """
        :param search_url: passed through to ``VideoProvider.__init__``.
        :param tz: timezone name applied to clip start timestamps in
            :meth:`get_metadata`.
        """
        super().__init__(search_url)
        self.tz = tz

    def available_dates(self, start_date: date, end_date: date):
        """
        Get dates with videos available, within the given date range (inclusive).
        Note that the server may respond oddly: a date returned by this method may yield no videos
        when used as an argument to :meth:`get_metadata`. This implies that there actually is a recording
        for that date, but it's grouped under some nearby previous date.
        There may also not actually be any recordings for that date, but a clip was mis-dated on the server.
        """
        first_of_month = pendulum.Date.instance(start_date).replace(day=1)
        # '<=' (not '<') so that the final month is still queried when end_date
        # itself falls on the first day of a month; the range is inclusive.
        while first_of_month <= end_date:
            for available_date in self.get_available_dates(first_of_month.year, first_of_month.month):
                if start_date <= available_date <= end_date:
                    yield available_date
            first_of_month = first_of_month.add(months=1)

    def get_metadata(self, for_date):
        """
        Yield one ``VideoMetadata`` per root clip found for ``for_date``.

        Note that querying for video metadata on a particular date may yield video clips dated for other days.
        Their video MMS URLs will match the specified date, but their actual dates can differ.
        """
        for clips in group_clips(self.get_clips(for_date)).values():
            clips = list(clips)
            if clips[0].title.startswith('Due to Technical Difficulties'):
                continue

            # Langley sometimes has meetings that don't have a clip encompassing the entire meeting.
            # Create a dummy 'entire meeting' clip that does.
            if not any(is_root_clip(clip.title) for clip in clips):
                fake_root = InsIncVideoClip(
                    clips[0].category,
                    'Entire Meeting',
                    clips[0].mms_url,
                    clips[0].for_date,
                    min(clip.start_time for clip in clips),
                    # The synthetic clip must end where the last clip ends, so
                    # take the latest end time rather than the latest start.
                    max(clip.end_time for clip in clips),
                )
                clips.insert(0, fake_root)

            for root, subclips in group_root_and_subclips(clips).items():
                start_ts = pendulum.combine(root.for_date, pendulum.parse(root.start_time).time()).tz_(self.tz)
                timecodes = [TimeCode(c.start_time, c.title, c.end_time) for c in subclips]
                if not timecodes:
                    # A root with no subclips is described by a single
                    # timecode covering the root itself.
                    timecodes.append(TimeCode(root.start_time, root.title, root.end_time))
                yield VideoMetadata(
                    video_id=os.path.basename(root.mms_url).replace('.wmv', ''),
                    category=root.category,
                    title=root.title,
                    url=root.mms_url,
                    start_ts=start_ts.isoformat(),
                    end_ts=root.end_time,
                    timecodes=timecodes,
                )

    def download(self, mms_url, destination_dir):
        """
        Download ``mms_url`` into ``destination_dir`` unless the file is already there.

        The destination file name is the basename of the URL. Progress and
        elapsed time are printed to stdout.
        """
        dest_file_path = os.path.join(destination_dir, os.path.basename(mms_url))
        if os.path.exists(dest_file_path):
            print("Already exists: " + dest_file_path)
            return

        start_time = datetime.now()
        print("Starting download of {} on {}".format(mms_url, start_time.isoformat()))
        download_mms(mms_url, dest_file_path)
        end_time = datetime.now()
        elapsed = end_time - start_time
        print("Download of {} completed on {} in {} seconds".format(
            mms_url, end_time.isoformat(), elapsed.total_seconds()))

    def postprocess(self, video_metadata, download_dir, destination_dir, **kwargs):
        """
        Clip the downloaded video to the span described by ``video_metadata``.

        Writes ``<clipped file name>.yaml`` describing the result into
        ``destination_dir`` and, unless the clipped file already exists,
        produces it with ``clip_video``. ``kwargs`` is accepted but unused.

        :return: the ``PreparedVideoInfo`` that was written to the YAML file.
        :raises ValueError: if the source video is missing from ``download_dir``.
        """
        filename_from_video_url = os.path.basename(video_metadata.url)
        video_path = os.path.join(download_dir, filename_from_video_url)
        if not os.path.exists(video_path):
            raise ValueError("{} doesn't exist".format(video_path))

        # The clip spans from the earliest start to the latest end among the
        # video's own start/end and all of its timecodes.
        start_timestamp = pendulum.parse(video_metadata.start_ts)
        start_timecode = min(chain([m.start_ts for m in video_metadata.timecodes], [start_timestamp.to_time_string()]))
        end_timecode = max(chain([m.end_ts for m in video_metadata.timecodes], [video_metadata.end_ts]))
        start_timecode = adjust_timecode(start_timecode, -2)
        end_timecode = adjust_timecode(end_timecode, 2)
        shift_timecodes(video_metadata.timecodes, start_timecode)

        # Insert '<start>_<end>' (colons removed) just before the file extension.
        filename_parts = filename_from_video_url.split('.')
        filename_parts.insert(len(filename_parts)-1, '{}_{}'.format(
            start_timecode.replace(':', ''), end_timecode.replace(':', '')))
        final_video_filename = '.'.join(filename_parts)
        dest_file = os.path.join(destination_dir, final_video_filename)

        prepped_video_info = PreparedVideoInfo(video_metadata, final_video_filename)
        with open(dest_file + '.yaml', 'w') as outf:
            yaml.dump(prepped_video_info, outf)

        if os.path.exists(dest_file):
            print(dest_file + " already exists")
        else:
            clip_video(video_path, start_timecode, end_timecode, dest_file)
        return prepped_video_info

    def _search(self, url, rs, rsargs):
        """
        POST a search request and return the response.

        :param url: endpoint to post to.
        :param rs: value sent as the ``rs`` form field.
        :param rsargs: values sent as the ``rsargs[]`` form field.
        :raises: whatever ``raise_for_status`` raises on an error response.
        """
        resp = self.session.post(url, data={
            'rs': rs,
            'rsargs[]': rsargs,
        })
        resp.raise_for_status()
        return resp

    def get_available_dates(self, year, month):
        """
        Get dates for which videos are available. Dates are in local time.
        """
        print("Getting available dates in {}-{}".format(year, month))
        resp = self._search(self.provider_url + '/meeting_search.php', 'show_calendar', [year, str(month).zfill(2)])
        # Each available date appears in the response as an escaped
        # write_date_string('Y-M-D') javascript link.
        for match in re.finditer(r"javascript: write_date_string\(\\'(\d+)-(\d+)-(\d+)\\'\)", resp.text):
            y, m, d = int(match.group(1)), int(match.group(2)), int(match.group(3))
            yield pendulum.Date(y, m, d)

    def get_clips(self, for_date: date):
        """
        Get all available video clips for the given local date.
        """
        # The '_sl' suffix yields mms:// URLs.
        resp = self._search(self.provider_url + '/meeting_search_sl.php',
                            'search_clips_sl', ['', for_date.strftime('%Y-%m-%d'), ''])

        # Strip the javascript wrapper around the HTML payload, then undo the
        # escaping of newlines and quotes inside it.
        start_bit, end_bit = "+:var res = { \"result\": '", "'}; res;"
        body = '"{}"'.format(resp.text[len(start_bit):-1 - len(end_bit)])
        body = body.replace("\\n", "\n").replace("\\'", "'").replace('\\"', '"')
        parsed_html = BeautifulSoup(body, 'html.parser')

        # Cells containing <strong> are category headings; they apply to the
        # clip link cells that follow until the next heading.
        category = None
        for element in parsed_html.select('td.gameDate'):
            strong, a_link = element.find('strong'), element.find('a')
            if strong:
                category = str(strong.string).strip()
            elif a_link:
                # Back up to previous <td> and grab the date.
                # Asking for videos on a particular date may yield videos that are for nearby dates,
                # but on the same date according to the video URL.
                actual_date = str(list(element.previous_siblings)[1].string).strip()
                actual_date = pendulum.parse(actual_date).date()

                href = a_link['href']
                match = re.match(
                    r"javascript:reload_media_sl\('(mms://[\w\-./]+)', '(\d+:\d+:\d+)', '(\d+:\d+:\d+)'\)", href)
                if not match:
                    continue

                mms_url, start_time, end_time = match.group(1), match.group(2), match.group(3)
                # NOTE(review): special-cases one specific start time,
                # presumably a known bad value in the server data -- confirm.
                if start_time == '41:09:00':
                    start_time = '00:41:09'
                title = str(element.string).strip()
                yield InsIncVideoClip(category, title, mms_url, actual_date, start_time, end_time)
def group_clips(clips) -> dict:
    """Group clips by MMS URL, order each group, and drop repeated titles.

    Clips are collected per ``mms_url`` in first-seen order. Unlike
    ``itertools.groupby``, which only merges *consecutive* equal keys, this
    keeps every clip for a URL even when the input interleaves URLs; with
    ``groupby`` a later run for the same URL silently replaced the earlier one.

    Within each group:

    * if the first clip is a root clip it stays first and the rest are sorted
      by ``start_time``; otherwise the whole group is sorted by ``start_time``;
    * if the group then still does not start with a root clip, the start times
      are adjusted so a root clip sorts first, and the group is re-sorted;
    * a clip whose title equals that of the clip just before it is dropped.

    Note that the adjustment step mutates ``start_time`` on the clip objects.

    :param clips: iterable of objects with ``mms_url``, ``title`` and
        ``start_time`` attributes.
    :return: ``OrderedDict`` mapping each MMS URL to its list of clips.
    """
    def by_start_time(clip):
        return clip.start_time

    clips_by_url = OrderedDict()
    for clip in clips:
        clips_by_url.setdefault(clip.mms_url, []).append(clip)

    groups = OrderedDict()
    for mms_url, grouped_clips in clips_by_url.items():
        # First, break any ties with root clip start times. Ensure root clips come first.
        if is_root_clip(grouped_clips[0].title):
            ordered_clips = [grouped_clips[0]]
            ordered_clips[1:] = sorted(grouped_clips[1:], key=by_start_time)
        else:
            ordered_clips = sorted(grouped_clips, key=by_start_time)

        if len(ordered_clips) > 1 and not is_root_clip(ordered_clips[0].title):
            if is_root_clip(ordered_clips[1].title, 'opening remarks'):
                # Swap start times with the second clip so it sorts first.
                ordered_clips[0].start_time, ordered_clips[1].start_time = \
                    ordered_clips[1].start_time, ordered_clips[0].start_time
            elif is_root_clip(ordered_clips[-1].title):
                # Move the root clip's start to just before the earliest clip.
                ordered_clips[-1].start_time = adjust_timecode(ordered_clips[0].start_time, -1)
            elif is_root_clip(ordered_clips[-2].title):
                ordered_clips[-2].start_time = adjust_timecode(ordered_clips[0].start_time, -1)
            # Re-sort now that start times may have changed.
            ordered_clips = sorted(grouped_clips, key=by_start_time)

        # Keep a clip only when its title differs from the clip just before it.
        dupes_removed = [ordered_clips[0]]
        for previous, clip in zip(ordered_clips, ordered_clips[1:]):
            if clip.title != previous.title:
                dupes_removed.append(clip)
        groups[mms_url] = dupes_removed
    return groups