# coding: utf-8
# ## Making GTFS from Payanam data
#
# Use the stops_all.csv and routes.csv generated by the reports script, not the JSONs.
#
# Days of week : assume all days
#
# Timings : take the timings if they exist, else take default timings defined in config/config.json
#
# v5 : Let's do incremental route-wise saving of stop_times.txt instead of accumulating for each route. AppendOrCreate.
# In[1]:
import pandas as pd
from collections import OrderedDict
import json, os, time, datetime, shutil
import zipfile #, zlib # for making .zip
# and import custom functions
import gtfs_common as gtfsC
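# (gtfsC functions used below: tripTimesProcess, timeFormat, computeDuration, computeSpeed, timeEngineTrips)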
# In[2]:
t1 = time.time()
# In[3]:
DEV = False
DEVLimit = 20
# In[4]:
try:
    root = os.path.dirname(__file__) # needed for tornado and all other paths; prog should work even if called from another working directory.
    gtfsFolder = os.path.join(root,'gtfs/')
    # gtfsFolder is declared here because we use a different path for gtfs output when running in a python notebook
except NameError:
    root = '../payanam'
    gtfsFolder = 'gtfs-test/'
reportsFolder = os.path.join(root,'reports/')
configFile = os.path.join(root,'config/','config.json')
logFolder = os.path.join(root,'reports/logs')
backupsFolder = os.path.join(root,'backups/')
stopsFile = 'stops_all.csv'
routesFile = 'routes.csv'
validationFile = 'gtfs_validation_report.html'
# In[5]:
config = json.load(open(configFile, 'r'))
config
# In[6]:
# create folders if they don't exist
for folder in [gtfsFolder]:
    if not os.path.exists(folder):
        os.makedirs(folder, exist_ok=True)
        print('Created',folder)
# In[7]:
# 6.8.19 : stop_times incremental: if stop_times.txt exists, then delete it as we want to create a fresh one
if os.path.exists(os.path.join(gtfsFolder,'stop_times.txt')):
    print("Found an existing stop_times.txt in gtfs output path so removing it.")
    os.remove(os.path.join(gtfsFolder,'stop_times.txt'))
# oh well, let's delete any existing gtfs.zip too
if os.path.exists(os.path.join(gtfsFolder,'gtfs.zip')):
    print("Found an existing gtfs.zip in gtfs output path so removing it.")
    os.remove(os.path.join(gtfsFolder,'gtfs.zip'))
# In[8]:
default_trip_times = gtfsC.tripTimesProcess( config.get('timeDefaults',{}).get('trip_times','10:00') )
# ah I see what ya did there. you put a default on the default. cheers mate.
default_trip_times
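# a sketch of the expected shape (an assumption, not verified against gtfs_common):
# tripTimesProcess appears to take a comma-separated string and return a list of
# hh:mm:ss strings, e.g. '10:00' -> ['10:00:00'], going by the hard-coded
# ['00:00:00'] and its "keep it in hh:mm:ss format" note further below.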
# ## functions
# which can't be moved out yet because they depend on one of the folder paths declared here.
# In[9]:
def logmessage( *content ):
    global logFolder
    timestamp = '{:%Y-%b-%d %H:%M:%S} :'.format(datetime.datetime.now())
    # from https://stackoverflow.com/a/26455617/4355695
    line = ' '.join(str(x) for x in list(content))
    # str(x) for x in list(content) : handles numbers in the list, converts them to strings before concatenating.
    # from https://stackoverflow.com/a/3590168/4355695
    print(line) # print to screen also
    f = open(os.path.join(logFolder, 'gtfslog.txt'), 'a', newline='\r\n', encoding='utf8') # open in append mode
    print(timestamp, line, file=f)
    # the `file=f` argument writes the line, with the newline defined above, to the file instead of to screen.
    # from https://stackoverflow.com/a/2918367/4355695
    f.close()
# In[10]:
def backup(filepath):
    global backupsFolder
    # make timestamp suffix for the backup filename
    backupSuffix = '_{:%Y%m%d-%H%M%S}'.format(datetime.datetime.now())
    destinationPath = os.path.join(backupsFolder, filepath.replace(root+'/','') + backupSuffix)
    # create intermediate folders if needed
    try:
        os.makedirs(os.path.dirname(destinationPath))
    except FileExistsError:
        pass # folder already exists
    try:
        shutil.copy(filepath, destinationPath)
    except Exception as e:
        logmessage('backup: Error in copying {} to {}: {}'.format(filepath,destinationPath,e))
        return False
    return backupSuffix
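# usage sketch (hypothetical timestamp): backup('gtfs/gtfs.zip') would copy it to
# roughly backups/gtfs/gtfs.zip_20190806-143000 and return '_20190806-143000', or False on failure.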
# In[11]:
def appendOrCreate(targetDF, targetCSV):
    if os.path.exists(targetCSV):
        targetDF.to_csv(targetCSV, mode='a', header=False, index=False)
    else:
        targetDF.to_csv(targetCSV, index=False)
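# used in the route loop below, e.g.:
# appendOrCreate(stopTimesIncrDF, os.path.join(gtfsFolder,'stop_times.txt'))
# the first call creates the CSV with a header row; later calls append rows without repeating the header.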
# In[12]:
'#'*70
# ## load up the main input files
# In[13]:
stops_src = pd.read_csv(os.path.join(reportsFolder,stopsFile),dtype=str).fillna('')
stops_src.head()
# In[14]:
routes_src = pd.read_csv(os.path.join(reportsFolder,routesFile),dtype=str).fillna('')
routes_src.head()
# In[15]:
# DEVine intervention
if DEV:
    routes_src = routes_src.iloc[655:655+DEVLimit]
# In[16]:
'#'*70
# ## agency.txt
# In[17]:
agencyDF = pd.DataFrame(data={'agency_id':['TSRTC_HYD'],'agency_name':['Telangana State Road Transport Corporation (Hyderabad local bus)'], 'agency_url': ['http://tsrtconline.in'], 'agency_timezone':['Asia/Kolkata']})
agencyDF
# In[18]:
'#'*70
# ## calendar.txt
# In[19]:
calendarDF = pd.DataFrame(data={"service_id":["WK","SA","SU","ALL"], "monday":['1','0','0','1'], "tuesday":['1','0','0','1'], "wednesday":['1','0','0','1'], "thursday":['1','0','0','1'], "friday":['1','0','0','1'], "saturday":['0','1','0','1'], "sunday":['0','0','1','1']})
calendarDF['start_date'] = 20190101
calendarDF['end_date'] = 20221231
calendarDF
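# note: the trips built below all use service_id 'ALL' for now; WK, SA and SU are defined here, presumably for later use.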
# In[20]:
'#'*70
# ## routes.txt
# decide:
# route_id : {folder}::{jsonFile minus the .json}
# trip_id : {route_id}:d{direction}:{n} for n = 1..(number of trips)
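# worked example: folder 'CNT' + jsonFile '16A.json' -> route_id 'CNT::16A';
# its first trip in direction 0 -> trip_id 'CNT::16A:d0:1'.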
# In[21]:
# mutate the routes file itself to make route.txt
routes_src['route_id'] = routes_src.apply(lambda x: "{}::{}".format(x.folder,x.jsonFile[:-5]), axis=1)
routes_src['route_type'] = '3'
routes_src['route_long_name'] = routes_src.apply(lambda x: "{}, {} depot".format(x.routeName,x.folder), axis=1)
# In[22]:
routes_src.head()
# In[23]:
print("Columns in routes.csv:",routes_src.columns)
# In[24]:
routesDF = routes_src[['route_id','routeName','route_long_name','busType','route_type']].rename(index=str, columns={'routeName':'route_short_name','busType':'route_categories'})
routesDF
# In[25]:
t2 = time.time()
logmessage("Starting + agency + calendar + routes took {} seconds.".format(round(t2-t1,2)))
# ## process stops
#
# - de-dupe by: zap name, stop_lat, stop_lon
# - assign a stop_id to each such triplet
# In[26]:
stops_uniqueDF = stops_src[(stops_src.stop_lat!='') & (stops_src.stop_lon!='')][['stop_name','stop_lat','stop_lon','zap']] \
    .drop_duplicates(['stop_name','stop_lat','stop_lon']) \
    .sort_values(['zap','stop_lat']).copy().reset_index(drop=True)
logmessage(len(stops_uniqueDF))
stops_uniqueDF.head()
# In[27]:
stops_uniqueDF['stop_id'] = '' # initiate column with blanks
# In[28]:
# assign stop_id's; will store as stops.txt
for N in range(len(stops_uniqueDF)):
    zap = stops_uniqueDF.at[N,'zap']
    suffix = 0
    while True:
        suffix += 1
        stop_id = '{}{}'.format(zap[:6].upper(), suffix)
        if stop_id not in stops_uniqueDF['stop_id'].tolist():
            break
    stops_uniqueDF.at[N,'stop_id'] = stop_id
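# e.g. (hypothetical zap value): stops whose zap starts with 'secund' get SECUND1, SECUND2, ...
# i.e. the first 6 characters of zap, uppercased, plus the lowest unused numeric suffix.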
# In[29]:
stops_uniqueDF.sample(10)
# In[30]:
stopsDF = stops_uniqueDF[['stop_id','stop_name','stop_lat','stop_lon']]
# In[31]:
t3 = time.time()
logmessage("Stops processing took {} seconds.".format(round(t3-t2,2)))
# In[32]:
'#'*70
# ## trips.txt and stop_times.txt .. and frequencies.txt too
# - Timings: just the defaults for now
# - have to check: if a route has no reverse direction, don't provision trips for it
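# rough shape of the trips/stop_times loop below, per route and direction:
# 1. gather the mapped stop sequence; if empty, note the route in oneDirList and skip that direction.
# 2. timings: use explicit trip_times if present; else, if a numeric frequency is given, make one 00:00:00 trip plus a frequencies.txt row; else fall back to default trip times.
# 3. compute duration/speed (whichever is missing, from the other), expand trips via timeEngineTrips, and write stop_times incrementally per route.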
# In[33]:
# make route_id in the stops_src DF too
stops_src['route_id'] = stops_src.apply(lambda x: "{}::{}".format(x.folder,x.jsonFile[:-5]), axis=1)
# In[34]:
tripsCollector = []
oneDirList = [] # spinoff data : routes that have only one direction
frequenciesCollector = []
for rN, routeRow in routes_src.iterrows():
    route_id = routeRow['route_id']
    logmessage()
    logmessage(rN, route_id)
    stopTimesCollector = [] # 6.8.19: doing stop_times incrementally, so re-initiating this for every route.
    for direction_id in ['0','1']:
        # check for presence of entries
        this_sequence = stops_src[(stops_src.route_id == route_id) & (stops_src.direction_id == direction_id) & (stops_src.stop_lat != '') & (stops_src.stop_lon != '')].copy().reset_index(drop=True)
        if not len(this_sequence):
            logmessage("Route {}: No data for direction {}".format(route_id,direction_id))
            oneDirList.append(route_id)
            continue
        tripLen = len(this_sequence) # this is also the count of stops that are actually mapped
        # look up trip_times in routes_src; if nothing there then assume default
        this_trip_times = gtfsC.tripTimesProcess(routeRow['t{}.trip_times'.format(direction_id)])
        if not len(this_trip_times):
            # no trip times: look up frequency, and if even that's not available, then assume default trip times.
            # frequency is now stored as seconds only, in t{}.frequency, instead of hr,min,sec
            if routeRow['t{}.frequency'.format(direction_id)].isdigit():
                # if there's a frequency, then stop_times has only one set of entries and the timings just give offsets, so advised to start the trip at 00:00:00 hrs
                this_trip_times = ['00:00:00'] # remember to keep it in hh:mm:ss format if hard-coding!
                start_time = gtfsC.timeFormat(routeRow['t{}.first_trip_start'.format(direction_id)])
                end_time = gtfsC.timeFormat(routeRow['t{}.last_trip_start'.format(direction_id)])
                # start and end times : if either of these is blank, assume defaults
                if not start_time:
                    start_time = gtfsC.timeFormat( config.get('timeDefaults',{}).get('first_trip_start','06:00') )
                if not end_time:
                    end_time = gtfsC.timeFormat( config.get('timeDefaults',{}).get('last_trip_start','22:00') )
                # provision the frequencies.txt entry here, with start and end times
                freqRow = OrderedDict()
                freqRow['trip_id'] = "{}:d{}:1".format(route_id,direction_id) # pre-making the trip_id: the trips loop below will have only one iteration and this will be the id made there as well.
                freqRow['start_time'] = start_time
                freqRow['end_time'] = end_time
                freqRow['headway_secs'] = routeRow['t{}.frequency'.format(direction_id)]
                freqRow['exact_times'] = '0' # not exact.
                frequenciesCollector.append(freqRow.copy())
            else:
                # neither trip times nor frequencies given. So, assume defaults
                logmessage("Route {}, direction {}: No timings data found so assuming default timings.".format(route_id,direction_id))
                this_trip_times = default_trip_times.copy()
        else:
            logmessage("Route {}, direction {}: Taking trip times.".format(route_id,direction_id))
        # duration and speed:
        # in absence of duration, use default speed to compute duration.
        # print(routeRow.get('t{}.duration'.format(direction_id),''))
        if not routeRow.get('t{}.duration'.format(direction_id),''):
            this_speed = config.get('timeDefaults',{}).get('defaultSpeed',15)
            this_duration, this_distance = gtfsC.computeDuration(this_sequence, this_speed) # gets duration in hh:mm:ss, and total distance
        else:
            this_duration = gtfsC.timeFormat( routeRow['t{}.duration'.format(direction_id)] )
            this_speed, this_distance = gtfsC.computeSpeed(this_sequence, this_duration) # gets speed in km/hr
        tripTimesArray = gtfsC.timeEngineTrips(this_trip_times, this_duration)
        # if route_id == 'CNT::16A': print(tripTimesArray)
        logmessage("direction {}: distance: {} km. duration: {}. speed: {}".format(direction_id, this_distance, this_duration, this_speed))
        for N, couple in enumerate(tripTimesArray):
            tripRow = OrderedDict({'route_id':route_id})
            tripRow['service_id'] = 'ALL' # assume only one for now
            trip_id = "{}:d{}:{}".format(route_id,direction_id,(N+1))
            tripRow['trip_id'] = trip_id
            tripRow['direction_id'] = direction_id
            # extra columns:
            tripRow['num_stops'] = len(this_sequence)
            tripRow['duration'] = this_duration
            tripRow['distance'] = this_distance
            tripRow['speed'] = this_speed
            tripRow['start_time'] = couple[0]
            tripsCollector.append(tripRow.copy())
            # make a df itself? prep up this_sequence to be a stop_times type table? nah, just iterate, lad
            for seqN, seqRow in this_sequence.iterrows():
                stRow = OrderedDict()
                stRow['trip_id'] = trip_id
                if seqN == 0:
                    stRow['arrival_time'] = stRow['departure_time'] = couple[0]
                elif seqN == (len(this_sequence)-1):
                    stRow['arrival_time'] = stRow['departure_time'] = couple[1]
                else:
                    stRow['arrival_time'] = stRow['departure_time'] = ''
                # stop_id : find by matching name, lat, lon
                stRow['stop_id'] = stops_uniqueDF[(stops_uniqueDF.stop_name == seqRow.stop_name) & (stops_uniqueDF.stop_lat == seqRow.stop_lat) & (stops_uniqueDF.stop_lon == seqRow.stop_lon)].stop_id.tolist()[0]
                stRow['stop_sequence'] = seqN + 1
                stRow['timepoint'] = '0'
                stopTimesCollector.append(stRow.copy())
            # end of sequence / stop_times loop
        # end of tripTimesArray loop
    # end of direction_id loop
    # 6.8.19: incrementally store stop_times instead of waiting till the end
    stopTimesIncrDF = pd.DataFrame(stopTimesCollector)
    appendOrCreate(stopTimesIncrDF, os.path.join(gtfsFolder,'stop_times.txt'))
    logmessage("Wrote {} lines to stop_times.txt".format(len(stopTimesIncrDF)))
    del stopTimesIncrDF
    del stopTimesCollector
# end of route loop
# In[35]:
tripsDF = pd.DataFrame(tripsCollector)
tripsDF.head(10)
# In[36]:
frequenciesDF = pd.DataFrame(frequenciesCollector)
frequenciesDF.head()
# In[37]:
# stopTimesDF = pd.DataFrame(stopTimesCollector)
# stopTimesDF.head(10)
# 6.8.19 : changed to incremental instead of all
# In[38]:
t4 = time.time()
logmessage("trips, frequencies and stop_times processing took {} seconds.".format(round(t4-t3,2)))
# In[39]:
'#'*70
# ## saving out the files
# In[40]:
# first, check if there is already a gtfs.zip created, and if so, make a backup
if os.path.exists(os.path.join(gtfsFolder,'gtfs.zip')):
    backup(os.path.join(gtfsFolder,'gtfs.zip'))
# In[41]:
os.path.join(gtfsFolder,'gtfs.zip')
# In[42]:
txtCollector = [] # collect the outputs in this so it's easier to zip them up la
zf = zipfile.ZipFile(os.path.join(gtfsFolder,'gtfs.zip'), mode='w')
# In[43]:
# done! saving as agency.txt
agencyDF.to_csv(os.path.join(gtfsFolder,'agency.txt'),index=False)
logmessage('Created agency.txt')
zf.write(os.path.join(gtfsFolder, 'agency.txt' ), arcname='agency.txt', compress_type=zipfile.ZIP_DEFLATED )
# In[44]:
# done! saving as calendar.txt
calendarDF.to_csv(os.path.join(gtfsFolder,'calendar.txt'),index=False)
logmessage('Created calendar.txt')
zf.write(os.path.join(gtfsFolder, 'calendar.txt' ), arcname='calendar.txt', compress_type=zipfile.ZIP_DEFLATED )
# In[45]:
stopsDF.to_csv(os.path.join(gtfsFolder,'stops.txt'),index=False)
logmessage('Created stops.txt, {} entries.'.format(len(stopsDF)))
zf.write(os.path.join(gtfsFolder, 'stops.txt' ), arcname='stops.txt', compress_type=zipfile.ZIP_DEFLATED )
# In[46]:
routesDF.to_csv(os.path.join(gtfsFolder,'routes.txt'),index=False)
logmessage('Created routes.txt, {} entries.'.format(len(routesDF)))
zf.write(os.path.join(gtfsFolder, 'routes.txt' ), arcname='routes.txt', compress_type=zipfile.ZIP_DEFLATED )
# In[47]:
tripsDF.to_csv(os.path.join(gtfsFolder,'trips.txt'),index=False)
logmessage('Created trips.txt, {} entries.'.format(len(tripsDF)))
zf.write(os.path.join(gtfsFolder, 'trips.txt' ), arcname='trips.txt', compress_type=zipfile.ZIP_DEFLATED )
# In[48]:
if len(frequenciesDF):
    frequenciesDF.to_csv(os.path.join(gtfsFolder,'frequencies.txt'),index=False)
    logmessage('Created frequencies.txt, {} entries.'.format(len(frequenciesDF)))
    zf.write(os.path.join(gtfsFolder, 'frequencies.txt' ), arcname='frequencies.txt', compress_type=zipfile.ZIP_DEFLATED )
else:
    logmessage('Did not create frequencies.txt as none provisioned.')
# In[49]:
# stopTimesDF.to_csv(os.path.join(gtfsFolder,'stop_times.txt'),index=False, chunksize=10000)
# logmessage('Created stop_times.txt, {} entries.'.format(len(stopTimesDF)))
# stop_times.txt already made, incrementally
zf.write(os.path.join(gtfsFolder, 'stop_times.txt' ), arcname='stop_times.txt', compress_type=zipfile.ZIP_DEFLATED )
# In[50]:
zf.close()
# In[51]:
logmessage("Zipped them up into gtfs.zip")
# In[52]:
t5 = time.time()
logmessage("writing out all GTFS files took {} seconds.".format(round(t5-t4,2)))
logmessage("The whole GTFS creation script took {} seconds.".format(round(t5-t1,2)))
# In[53]:
# one more backup command : for the validation results html that will be generated right after this script is run
# reports/gtfs_validation_report.html
backup(os.path.join(reportsFolder, validationFile))