# coding: utf-8
# ## Making GTFS from Payanam data
#
# Use the stops_all.csv and routes.csv generated by the reports script, not the JSONs.
#
# Days of week : assume all days
#
# Timings : take the timings if they exist, else take default timings defined in config/config.json
#
# v5 : Let's do incremental route-wise saving of stop_times.txt instead of accumulating for each route. AppendOrCreate.
# In[1]:
import pandas as pd
from collections import OrderedDict
import json, os, time, datetime, shutil
import zipfile #, zlib # for making .zip
# and import custom functions
import gtfs_common as gtfsC
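# (gtfsC functions used below: tripTimesProcess, timeFormat, computeDuration, computeSpeed, timeEngineTrips)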
# In[2]:
t1 = time.time()
# In[3]:
DEV = False
DEVLimit = 20
# In[4]:
try:
    root = os.path.dirname(__file__) # needed for tornado and all other paths; prog should work even if called from another working directory.
    gtfsFolder = os.path.join(root,'gtfs/')
    # gtfsFolder is declared here because we use a different path for gtfs output when running in a python notebook
except NameError:
    root = '../payanam'
    gtfsFolder = 'gtfs-test/'
reportsFolder = os.path.join(root,'reports/')
configFile = os.path.join(root,'config/','config.json')
logFolder = os.path.join(root,'reports/logs')
backupsFolder = os.path.join(root,'backups/')
stopsFile = 'stops_all.csv'
routesFile = 'routes.csv'
validationFile = 'gtfs_validation_report.html'
# In[5]:
config = json.load(open(configFile, 'r'))
config
# In[6]:
# create folders if they don't exist
for folder in [gtfsFolder]:
    if not os.path.exists(folder):
        os.makedirs(folder, exist_ok=True)
        print('Created',folder)
# In[7]:
# 6.8.19 : stop_times incremental: if stop_times.txt exists, then delete it as we want to create a fresh one
if os.path.exists(os.path.join(gtfsFolder,'stop_times.txt')):
    print("Found an existing stop_times.txt in gtfs output path so removing it.")
    os.remove(os.path.join(gtfsFolder,'stop_times.txt'))
# oh well, let's delete any existing gtfs.zip too
if os.path.exists(os.path.join(gtfsFolder,'gtfs.zip')):
    print("Found an existing gtfs.zip in gtfs output path so removing it.")
    os.remove(os.path.join(gtfsFolder,'gtfs.zip'))
# In[8]:
default_trip_times = gtfsC.tripTimesProcess( config.get('timeDefaults',{}).get('trip_times','10:00') )
# ah I see what ya did there. you put a default on the default. cheers mate.
default_trip_times
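# a sketch of the expected shape (an assumption, not verified against gtfs_common):
# tripTimesProcess appears to take a comma-separated string and return a list of
# hh:mm:ss strings, e.g. '10:00' -> ['10:00:00'], going by the hard-coded
# ['00:00:00'] and its "keep it in hh:mm:ss format" note further below.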
# ## functions
# which can't be moved out yet because they depend on one of the folder paths declared here.
# In[9]:
def logmessage( *content ):
    global logFolder
    timestamp = '{:%Y-%b-%d %H:%M:%S} :'.format(datetime.datetime.now())
    # from https://stackoverflow.com/a/26455617/4355695
    line = ' '.join(str(x) for x in list(content))
    # str(x) for x in list(content) : handles numbers in the list, converts them to strings before concatenating.
    # from https://stackoverflow.com/a/3590168/4355695
    print(line) # print to screen also
    f = open(os.path.join(logFolder, 'gtfslog.txt'), 'a', newline='\r\n', encoding='utf8') # open in append mode
    print(timestamp, line, file=f)
    # the `file=f` argument writes the line, with the newline defined above, to the file instead of to screen.
    # from https://stackoverflow.com/a/2918367/4355695
    f.close()
# In[10]:
def backup(filepath):
    global backupsFolder
    # make timestamp suffix for the backup filename
    backupSuffix = '_{:%Y%m%d-%H%M%S}'.format(datetime.datetime.now())
    destinationPath = os.path.join(backupsFolder, filepath.replace(root+'/','') + backupSuffix)
    # create intermediate folders if needed
    try:
        os.makedirs(os.path.dirname(destinationPath))
    except FileExistsError:
        pass # folder already exists
    try:
        shutil.copy(filepath, destinationPath)
    except Exception as e:
        logmessage('backup: Error in copying {} to {}: {}'.format(filepath,destinationPath,e))
        return False
    return backupSuffix
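# usage sketch (hypothetical timestamp): backup('gtfs/gtfs.zip') would copy it to
# roughly backups/gtfs/gtfs.zip_20190806-143000 and return '_20190806-143000', or False on failure.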
# In[11]:
def appendOrCreate(targetDF, targetCSV):
    if os.path.exists(targetCSV):
        targetDF.to_csv(targetCSV, mode='a', header=False, index=False)
    else:
        targetDF.to_csv(targetCSV, index=False)
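# used in the route loop below, e.g.:
# appendOrCreate(stopTimesIncrDF, os.path.join(gtfsFolder,'stop_times.txt'))
# the first call creates the CSV with a header row; later calls append rows without repeating the header.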
# In[12]:
'#'*70
# ## load up the main input files
# In[13]:
stops_src = pd.read_csv(os.path.join(reportsFolder,stopsFile),dtype=str).fillna('')
stops_src.head()
# In[14]:
routes_src = pd.read_csv(os.path.join(reportsFolder,routesFile),dtype=str).fillna('')
routes_src.head()
# In[15]:
# DEVine intervention
if DEV:
    routes_src = routes_src.iloc[655:655+DEVLimit]
# In[16]:
'#'*70
# ## agency.txt
# In[17]:
agencyDF = pd.DataFrame(data={'agency_id':['TSRTC_HYD'],'agency_name':['Telangana State Road Transport Corporation (Hyderabad local bus)'], 'agency_url': ['http://tsrtconline.in'], 'agency_timezone':['Asia/Kolkata']})
agencyDF
# In[18]:
'#'*70
# ## calendar.txt
# In[19]:
calendarDF = pd.DataFrame(data={"service_id":["WK","SA","SU","ALL"], "monday":['1','0','0','1'], "tuesday":['1','0','0','1'], "wednesday":['1','0','0','1'], "thursday":['1','0','0','1'], "friday":['1','0','0','1'], "saturday":['0','1','0','1'], "sunday":['0','0','1','1']})
calendarDF['start_date'] = 20190101
calendarDF['end_date'] = 20221231
calendarDF
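# note: the trips built below all use service_id 'ALL' for now; WK, SA and SU are defined here, presumably for later use.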
# In[20]:
'#'*70
# ## routes.txt
# decide:
# route_id : {folder}::{jsonFile minus the .json}
# trip_id : {route_id}:d{direction}:{n} for n = 1..(number of trips)
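# worked example: folder 'CNT' + jsonFile '16A.json' -> route_id 'CNT::16A';
# its first trip in direction 0 -> trip_id 'CNT::16A:d0:1'.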
# In[21]:
# mutate the routes file itself to make route.txt
routes_src['route_id'] = routes_src.apply(lambda x: "{}::{}".format(x.folder,x.jsonFile[:-5]), axis=1)
routes_src['route_type'] = '3'
routes_src['route_long_name'] = routes_src.apply(lambda x: "{}, {} depot".format(x.routeName,x.folder), axis=1)
# In[22]:
routes_src.head()
# In[23]:
print("Columns in routes.csv:",routes_src.columns)
# In[24]:
routesDF = routes_src[['route_id','routeName','route_long_name','busType','route_type']].rename(index=str, columns={'routeName':'route_short_name','busType':'route_categories'})
routesDF
# In[25]:
t2 = time.time()
logmessage("Starting + agency + calendar + routes took {} seconds.".format(round(t2-t1,2)))
# ## process stops
#
# - de-dupe by: zap name, stop_lat, stop_lon
# - assign a stop_id to each such triplet
# In[26]:
stops_uniqueDF = stops_src[(stops_src.stop_lat!='') & (stops_src.stop_lon!='')][['stop_name','stop_lat','stop_lon','zap']] \
    .drop_duplicates(['stop_name','stop_lat','stop_lon']) \
    .sort_values(['zap','stop_lat']).copy().reset_index(drop=True)
logmessage(len(stops_uniqueDF))
stops_uniqueDF.head()
# In[27]:
stops_uniqueDF['stop_id'] = '' # initiate column with blanks
# In[28]:
# assign stop_id's; will store as stops.txt
for N in range(len(stops_uniqueDF)):
    zap = stops_uniqueDF.at[N,'zap']
    suffix = 0
    while True:
        suffix += 1
        stop_id = '{}{}'.format(zap[:6].upper(), suffix)
        if stop_id not in stops_uniqueDF['stop_id'].tolist():
            break
    stops_uniqueDF.at[N,'stop_id'] = stop_id
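# e.g. (hypothetical zap value): stops whose zap starts with 'secund' get SECUND1, SECUND2, ...
# i.e. the first 6 characters of zap, uppercased, plus the lowest unused numeric suffix.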
# In[29]:
stops_uniqueDF.sample(10)
# In[30]:
stopsDF = stops_uniqueDF[['stop_id','stop_name','stop_lat','stop_lon']]
# In[31]:
t3 = time.time()
logmessage("Stops processing took {} seconds.".format(round(t3-t2,2)))
# In[32]:
'#'*70
# ## trips.txt and stop_times.txt .. and frequencies.txt too
# - Timings: just the defaults for now
# - have to check: if a route has no reverse direction, don't provision trips for it
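# rough shape of the trips/stop_times loop below, per route and direction:
# 1. gather the mapped stop sequence; if empty, note the route in oneDirList and skip that direction.
# 2. timings: use explicit trip_times if present; else, if a numeric frequency is given, make one 00:00:00 trip plus a frequencies.txt row; else fall back to default trip times.
# 3. compute duration/speed (whichever is missing, from the other), expand trips via timeEngineTrips, and write stop_times incrementally per route.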
# In[33]:
# make route_id in the stops_src DF too
stops_src['route_id'] = stops_src.apply(lambda x: "{}::{}".format(x.folder,x.jsonFile[:-5]), axis=1)
# In[34]:
tripsCollector = []
oneDirList = [] # spinoff data : routes that have only one direction
frequenciesCollector = []
for rN, routeRow in routes_src.iterrows():
    route_id = routeRow['route_id']
    logmessage()
    logmessage(rN, route_id)
    stopTimesCollector = [] # 6.8.19: doing stop_times incrementally, so re-initiating this for every route.
    for direction_id in ['0','1']:
        # check for presence of entries
        this_sequence = stops_src[(stops_src.route_id == route_id) & (stops_src.direction_id == direction_id) & (stops_src.stop_lat != '') & (stops_src.stop_lon != '')].copy().reset_index(drop=True)
        if not len(this_sequence):
            logmessage("Route {}: No data for direction {}".format(route_id,direction_id))
            oneDirList.append(route_id)
            continue
        tripLen = len(this_sequence) # this is also the count of stops that are actually mapped
        # look up trip_times in routes_src; if nothing there then assume default
        this_trip_times = gtfsC.tripTimesProcess(routeRow['t{}.trip_times'.format(direction_id)])
        if not len(this_trip_times):
            # no trip times: look up frequency, and if even that's not available, then assume default trip times.
            # frequency is now stored as seconds only, in t{}.frequency, instead of hr,min,sec
            if routeRow['t{}.frequency'.format(direction_id)].isdigit():
                # if there's a frequency, then stop_times has only one set of entries and the timings just give offsets, so advised to start the trip at 00:00:00 hrs
                this_trip_times = ['00:00:00'] # remember to keep it in hh:mm:ss format if hard-coding!
                start_time = gtfsC.timeFormat(routeRow['t{}.first_trip_start'.format(direction_id)])
                end_time = gtfsC.timeFormat(routeRow['t{}.last_trip_start'.format(direction_id)])
                # start and end times : if either of these is blank, assume defaults
                if not start_time:
                    start_time = gtfsC.timeFormat( config.get('timeDefaults',{}).get('first_trip_start','06:00') )
                if not end_time:
                    end_time = gtfsC.timeFormat( config.get('timeDefaults',{}).get('last_trip_start','22:00') )
                # provision the frequencies.txt entry here, with start and end times
                freqRow = OrderedDict()
                freqRow['trip_id'] = "{}:d{}:1".format(route_id,direction_id) # pre-making the trip_id: the trips loop below will have only one iteration and this will be the id made there as well.
                freqRow['start_time'] = start_time
                freqRow['end_time'] = end_time
                freqRow['headway_secs'] = routeRow['t{}.frequency'.format(direction_id)]
                freqRow['exact_times'] = '0' # not exact.
                frequenciesCollector.append(freqRow.copy())
            else:
                # neither trip times nor frequencies given. So, assume defaults
                logmessage("Route {}, direction {}: No timings data found so assuming default timings.".format(route_id,direction_id))
                this_trip_times = default_trip_times.copy()
        else:
            logmessage("Route {}, direction {}: Taking trip times.".format(route_id,direction_id))
        # duration and speed:
        # in absence of duration, use default speed to compute duration.
        # print(routeRow.get('t{}.duration'.format(direction_id),''))
        if not routeRow.get('t{}.duration'.format(direction_id),''):
            this_speed = config.get('timeDefaults',{}).get('defaultSpeed',15)
            this_duration, this_distance = gtfsC.computeDuration(this_sequence, this_speed) # gets duration in hh:mm:ss, and total distance
        else:
            this_duration = gtfsC.timeFormat( routeRow['t{}.duration'.format(direction_id)] )
            this_speed, this_distance = gtfsC.computeSpeed(this_sequence, this_duration) # gets speed in km/hr
        tripTimesArray = gtfsC.timeEngineTrips(this_trip_times, this_duration)
        # if route_id == 'CNT::16A': print(tripTimesArray)
        logmessage("direction {}: distance: {} km. duration: {}. speed: {}".format(direction_id, this_distance, this_duration, this_speed))
        for N, couple in enumerate(tripTimesArray):
            tripRow = OrderedDict({'route_id':route_id})
            tripRow['service_id'] = 'ALL' # assume only one for now
            trip_id = "{}:d{}:{}".format(route_id,direction_id,(N+1))
            tripRow['trip_id'] = trip_id
            tripRow['direction_id'] = direction_id
            # extra columns:
            tripRow['num_stops'] = len(this_sequence)
            tripRow['duration'] = this_duration
            tripRow['distance'] = this_distance
            tripRow['speed'] = this_speed
            tripRow['start_time'] = couple[0]
            tripsCollector.append(tripRow.copy())
            # make a df itself? prep up this_sequence to be a stop_times type table? nah, just iterate, lad
            for seqN, seqRow in this_sequence.iterrows():
                stRow = OrderedDict()
                stRow['trip_id'] = trip_id
                if seqN == 0:
                    stRow['arrival_time'] = stRow['departure_time'] = couple[0]
                elif seqN == (len(this_sequence)-1):
                    stRow['arrival_time'] = stRow['departure_time'] = couple[1]
                else:
                    stRow['arrival_time'] = stRow['departure_time'] = ''
                # stop_id : find by matching name, lat, lon
                stRow['stop_id'] = stops_uniqueDF[(stops_uniqueDF.stop_name == seqRow.stop_name) & (stops_uniqueDF.stop_lat == seqRow.stop_lat) & (stops_uniqueDF.stop_lon == seqRow.stop_lon)].stop_id.tolist()[0]
                stRow['stop_sequence'] = seqN + 1
                stRow['timepoint'] = '0'
                stopTimesCollector.append(stRow.copy())
            # end of sequence / stop_times loop
        # end of tripTimesArray loop
    # end of direction_id loop
    # 6.8.19: incrementally store stop_times instead of waiting till the end
    stopTimesIncrDF = pd.DataFrame(stopTimesCollector)
    appendOrCreate(stopTimesIncrDF, os.path.join(gtfsFolder,'stop_times.txt'))
    logmessage("Wrote {} lines to stop_times.txt".format(len(stopTimesIncrDF)))
    del stopTimesIncrDF
    del stopTimesCollector
# end of route loop
# In[35]:
tripsDF = pd.DataFrame(tripsCollector)
tripsDF.head(10)
# In[36]:
frequenciesDF = pd.DataFrame(frequenciesCollector)
frequenciesDF.head()
# In[37]:
# stopTimesDF = pd.DataFrame(stopTimesCollector)
# stopTimesDF.head(10)
# 6.8.19 : changed to incremental instead of all
# In[38]:
t4 = time.time()
logmessage("trips, frequencies and stop_times processing took {} seconds.".format(round(t4-t3,2)))
# In[39]:
'#'*70
# ## saving out the files
# In[40]:
# first, check if there is already a gtfs.zip created, and if so, make a backup
if os.path.exists(os.path.join(gtfsFolder,'gtfs.zip')):
    backup(os.path.join(gtfsFolder,'gtfs.zip'))
# In[41]:
os.path.join(gtfsFolder,'gtfs.zip')
# In[42]:
txtCollector = [] # collect the outputs in this so it's easier to zip them up la
zf = zipfile.ZipFile(os.path.join(gtfsFolder,'gtfs.zip'), mode='w')
# In[43]:
# done! saving as agency.txt
agencyDF.to_csv(os.path.join(gtfsFolder,'agency.txt'),index=False)
logmessage('Created agency.txt')
zf.write(os.path.join(gtfsFolder, 'agency.txt' ), arcname='agency.txt', compress_type=zipfile.ZIP_DEFLATED )
# In[44]:
# done! saving as calendar.txt
calendarDF.to_csv(os.path.join(gtfsFolder,'calendar.txt'),index=False)
logmessage('Created calendar.txt')
zf.write(os.path.join(gtfsFolder, 'calendar.txt' ), arcname='calendar.txt', compress_type=zipfile.ZIP_DEFLATED )
# In[45]:
stopsDF.to_csv(os.path.join(gtfsFolder,'stops.txt'),index=False)
logmessage('Created stops.txt, {} entries.'.format(len(stopsDF)))
zf.write(os.path.join(gtfsFolder, 'stops.txt' ), arcname='stops.txt', compress_type=zipfile.ZIP_DEFLATED )
# In[46]:
routesDF.to_csv(os.path.join(gtfsFolder,'routes.txt'),index=False)
logmessage('Created routes.txt, {} entries.'.format(len(routesDF)))
zf.write(os.path.join(gtfsFolder, 'routes.txt' ), arcname='routes.txt', compress_type=zipfile.ZIP_DEFLATED )
# In[47]:
tripsDF.to_csv(os.path.join(gtfsFolder,'trips.txt'),index=False)
logmessage('Created trips.txt, {} entries.'.format(len(tripsDF)))
zf.write(os.path.join(gtfsFolder, 'trips.txt' ), arcname='trips.txt', compress_type=zipfile.ZIP_DEFLATED )
# In[48]:
if len(frequenciesDF):
    frequenciesDF.to_csv(os.path.join(gtfsFolder,'frequencies.txt'),index=False)
    logmessage('Created frequencies.txt, {} entries.'.format(len(frequenciesDF)))
    zf.write(os.path.join(gtfsFolder, 'frequencies.txt' ), arcname='frequencies.txt', compress_type=zipfile.ZIP_DEFLATED )
else:
    logmessage('Did not create frequencies.txt as none provisioned.')
# In[49]:
# stopTimesDF.to_csv(os.path.join(gtfsFolder,'stop_times.txt'),index=False, chunksize=10000)
# logmessage('Created stop_times.txt, {} entries.'.format(len(stopTimesDF)))
# stop_times.txt already made, incrementally
zf.write(os.path.join(gtfsFolder, 'stop_times.txt' ), arcname='stop_times.txt', compress_type=zipfile.ZIP_DEFLATED )
# In[50]:
zf.close()
# In[51]:
logmessage("Zipped them up into gtfs.zip")
# In[52]:
t5 = time.time()
logmessage("writing out all GTFS files took {} seconds.".format(round(t5-t4,2)))
logmessage("The whole GTFS creation script took {} seconds.".format(round(t5-t1,2)))
# In[53]:
# one more backup command : for the validation results html that will be generated right after this script is run
# reports/gtfs_validation_report.html
backup(os.path.join(reportsFolder, validationFile))