import os
import tarfile
import networkx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
"""
Author: Shilpa Kancharla
Last Modified: November 21, 2021
"""
class NetworkInput:
"""
Parameters in NetworkInput class (used for data processing):
1. data_directory: Specifies if training, validation, or test directory is in use
2. topology_size: Size of network.
3. input_filepath: Contains simulation number, the topology file, and the routing file used for that simulation.
4. traffic_filepath: Contains the traffic parameters used by the simulator to generate the traffic for each iteration
5. link_filepath: Denotes if there is link between each src-dst pair. If there link exists, it contains the outgoing
port statistics separated by comma (,) or -1 otherwise.
6. graph_filepath: directory of graph files
7. sim_filepath: Contains the measurements obtained by our network simulator for every sample.
8. output_name: Output name of .csv file
Create the following objects from input data:
1. simulation_numbers
2. list_of_traffic_measurements
3. global_packets_list
4. global_losses_list
5. global_delay_list
6. metrics_list
7. topology_object
8. port_statistics_list
"""
    def __init__(self, data_directory, topology_size, input_filepath, traffic_filepath, link_filepath,
                 graph_filepath, sim_filepath, output_name):
        self.data_directory = data_directory
        self.topology_size = topology_size
        self.input_filepath = input_filepath
        self.traffic_filepath = traffic_filepath
        self.link_filepath = link_filepath
        self.graph_filepath = graph_filepath
        self.sim_filepath = sim_filepath
        self.output_name = output_name
        # Data extraction and collection
        self.simulation_numbers, routing_matrices = self.process_input_file(self.input_filepath)
        #self.routes = self.process_routing_matrix(routing_matrices)
        self.list_of_traffic_measurements = self.get_traffic_metrics(self.traffic_filepath)
        self.global_packets_list, self.global_losses_list, self.global_delay_list, self.metrics_list = self.get_simulation_metrics(self.sim_filepath)
        self.topology_object = self.graph_process(graph_filepath)
        self.port_statistics_list = self.get_link_usage_metrics(self.link_filepath)
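    # A minimal construction sketch (all paths hypothetical; the real layout
    # depends on how the dataset was extracted):
    #   dataset = NetworkInput('training_data\\', 25,
    #                          'training_data\\25\\results\\input_files.txt',
    #                          'training_data\\25\\results\\traffic.txt',
    #                          'training_data\\25\\results\\linkUsage.txt',
    #                          'training_data\\25\\graphs\\',
    #                          'training_data\\25\\results\\simulationResults.txt',
    #                          'sample_output')
    #   dataset.write_to_csv()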
"""
Converting data collected and processed from simulationResults.txt, traffic.txt, and linkUsage.txt to dataframes, which are then
converted to .csv files.
"""
def write_to_csv(self):
# Processing data from simulationResults.txt
frames_metrics = []
for i in range(0, len(self.metrics_list)):
df_temp = pd.DataFrame(self.metrics_list[i])
frames_metrics.append(df_temp)
# Dataframe of simulationResults.txt - concatenation
metrics_result_df = pd.concat(frames_metrics, ignore_index = True)
# Processing data from traffic.txt
frames_traffic = []
dropped_idx = [] # Keep track of the dropped indices
counter = 0
for data in self.list_of_traffic_measurements:
for key in data:
counter = counter + 1
if not data[key]: # Do not include empty dictionaries in dataframe
dropped_idx.append(counter - 1) # Index the keys of the dictionary
break
# Read dataframe with max avg lambda as index, then reset index to column
df_temp = pd.DataFrame.from_dict(data, orient = 'index').reset_index().rename(columns = {'index': 'Max Avg Lambda'})
# Flatten the time distribution parameters and size distribution parameters, join with dataframe
df_temp = df_temp.join(pd.json_normalize(df_temp['Time Distribution Parameters']))
df_temp = df_temp.join(pd.json_normalize(df_temp['Size Distribution Parameters']))
# Remove redundant columns
df_temp = df_temp.drop(columns = ['Time Distribution Parameters', 'Size Distribution Parameters'])
frames_traffic.append(df_temp)
# Dataframe of traffic.txt - concatenation
traffic_result_df = pd.concat(frames_traffic, ignore_index = True)
# Processing data from linkUsage.txt, dataframe of linkUsage.txt
df_port = pd.DataFrame(self.port_statistics_list, columns = ['Link Exists',
'Avg Utilization',
'Avg Packet Loss',
'Avg Packet Length',
'Avg Utilization First',
'Avg Packet Loss Rate',
'Avg Port Occupancy',
'Max Queue Occupancy',
'Avg Packet Length First'])
# Drop the indices of the unmatched frames for port and metrics dataframes
df_port_mod = df_port.drop(labels = dropped_idx, axis = 0).reset_index()
metrics_result_df_mod = metrics_result_df.drop(labels = dropped_idx, axis = 0).reset_index()
# Join the dataframes together
frames = [metrics_result_df_mod, traffic_result_df, df_port_mod]
df_features_truth = pd.concat(frames, axis = 1).drop(['index'], axis = 1)
# Create .csv file, place in tabular_data folder
filename = 'tabular_data/' + str(self.topology_size) + '/' + self.output_name + '.csv'
df_features_truth.to_csv(filename, index = False)
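    # Usage note: with topology_size 25 and a hypothetical output_name 'sample',
    # write_to_csv() writes 'tabular_data/25/sample.csv'. The 'tabular_data/25/'
    # folder must already exist, since DataFrame.to_csv does not create directories.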
"""
Return the traffic metrics as a dictionary with the maximum average lambda value as the key and the
metrics for each simulation as values.
@param filepath: traffic file for simulation
@return list of traffic measurements with description of time and size distribution parameters
"""
def get_traffic_metrics(self, filepath):
traffic_file = open(filepath)
max_avg_lambda_list = []
traffic_measurements = []
for line in traffic_file:
traffic_tokens = line.split()
max_avg_lambda = None
for token in traffic_tokens:
traffic_data = token.split(';')
max_avg_lambda, rest_of_token = self.get_max_avg_lambda(traffic_data[0])
first_tokens = rest_of_token.split(',')
tokens = []
for t in first_tokens:
tokens.append(t)
traffic_measurements.append(tokens)
modified_tokens = traffic_data[1:]
for m in modified_tokens:
rest_to_add = m.split(',')
tokens_ = []
for r in rest_to_add:
if r == None:
continue
else:
tokens_.append(r)
max_avg_lambda_list.append(max_avg_lambda)
traffic_measurements.append(tokens_)
traffic_file.close() # Close file once done processing
traffic_measurements_modified = [x for x in traffic_measurements if x != ['']] # Remove empty lists from list
# Match up the max avg lambda value with the tokens using list comprehension
list_dict_with_max_lambda = []
for m, t in zip(max_avg_lambda_list, traffic_measurements_modified):
match_dict = dict()
t_parameterized = self.get_time_size_distribution_parameters(t)
match_dict[m] = t_parameterized
list_dict_with_max_lambda.append(match_dict)
return list_dict_with_max_lambda
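    # Sketch of the traffic.txt layout this parser assumes (values hypothetical):
    # a whitespace-separated token such as '1000.0|0,2.0,1.5,10.0;1,3.0,2.5' carries
    # the maximum average lambda before the first '|', followed by ';'-separated
    # parameter groups whose fields are comma-separated; each group is paired with
    # the lambda value and parsed by get_time_size_distribution_parameters().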
"""
Get and process the time and size distribution parameters.
@param traffic_measurement: traffic measurement to process
@return dictionary of traffic measurement with appropriate time and size distribution parameters
"""
def get_time_size_distribution_parameters(self, traffic_measurement):
traffic_dictionary = dict()
for i in range(len(traffic_measurement)):
offset = self.create_traffic_time_distribution(traffic_measurement, traffic_dictionary)
if offset != -1: # Go through this loop if we have a valid offset
self.create_traffic_size_distribution(traffic_measurement, offset, traffic_dictionary)
return traffic_dictionary
"""
Fill out dictionary with traffic time distribution metrics and return the offset of where to read size distribution parameters.
@param traffic_metrics: list of all the flow traffic metrics to be processed
@param traffic_dictionary: dictionary to fill out with the time distribution information
@return number of elements read from the list of parameters data
"""
def create_traffic_time_distribution(self, traffic_metrics, traffic_dictionary):
if int(traffic_metrics[0]) == 0:
traffic_dictionary['Time Distribution'] = 'Exponential'
parameters = dict()
parameters['Equivalent Lambda'] = float(traffic_metrics[1])
parameters['Average Packet Lambda'] = float(traffic_metrics[2])
parameters['Exponential Max Factor'] = float(traffic_metrics[3])
traffic_dictionary['Time Distribution Parameters'] = parameters
return 4
elif int(traffic_metrics[0]) == 1:
traffic_dictionary['Time Distribution'] = 'Deterministic'
parameters = dict()
parameters['Equivalent Lambda'] = float(traffic_metrics[1])
parameters['Average Packet Lambda'] = float(traffic_metrics[2])
traffic_dictionary['Time Distribution Parameters'] = parameters
return 3
elif int(traffic_metrics[0]) == 2:
traffic_dictionary['Time Distribution'] = 'Uniform'
parameters = dict()
parameters['Equivalent Lambda'] = float(traffic_metrics[1])
parameters['Min Packet Lambda'] = float(traffic_metrics[2])
parameters['Max Packet Lambda'] = float(traffic_metrics[3])
traffic_dictionary['Time Distribution Parameters'] = parameters
return 4
elif int(traffic_metrics[0]) == 3:
traffic_dictionary['Time Distribution'] = 'Normal'
parameters = dict()
parameters['Equivalent Lambda'] = float(traffic_metrics[1])
parameters['Average Packet Lambda'] = float(traffic_metrics[2])
parameters['Standard Deviation'] = float(traffic_metrics[3])
traffic_dictionary['Time Distribution Parameters'] = parameters
return 4
elif int(traffic_metrics[0]) == 4:
traffic_dictionary['Time Distribution'] = 'OnOff'
parameters = dict()
parameters['Equivalent Lambda'] = float(traffic_metrics[1])
parameters['Packets Lambda On'] = float(traffic_metrics[2])
parameters['Average Time Off'] = float(traffic_metrics[3])
parameters['Average Time On'] = float(traffic_metrics[4])
parameters['Exponential Max Factor'] = float(traffic_metrics[5])
traffic_dictionary['Time Distribution Parameters'] = parameters
return 6
elif int(traffic_metrics[0]) == 5:
traffic_dictionary['Time Distribution'] = 'PPBP'
parameters = dict()
parameters['Equivalent Lambda'] = float(traffic_metrics[1])
parameters['Burst Gen Lambda'] = float(traffic_metrics[2])
parameters['Bit Rate'] = float(traffic_metrics[3])
parameters['Pare to Min Size'] = float(traffic_metrics[4])
parameters['Pare to Max Size'] = float(traffic_metrics[5])
parameters['Pare to Alpha'] = float(traffic_metrics[6])
parameters['Exponential Max Factor'] = float(traffic_metrics[7])
traffic_dictionary['Time Distribution Parameters'] = parameters
return 8
else:
return -1
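    # Example, derived from the branches above (values hypothetical): for
    # traffic_metrics ['0', '2.0', '1.5', '10.0'], the leading '0' selects
    # 'Exponential', the dictionary gains {'Equivalent Lambda': 2.0,
    # 'Average Packet Lambda': 1.5, 'Exponential Max Factor': 10.0} under
    # 'Time Distribution Parameters', and the method returns an offset of 4.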
"""
Retrieve the size distribution parameters of the traffic data.
@param traffic_metrics: list of all the flow traffic metrics to be processed
@param offset: number of elements read from the list of parameters data
@param traffic_dictionary: dictionary to fill out with the time distribution information
@return 0 if successful iteration, or -1 otherwise
"""
def create_traffic_size_distribution(self, traffic_metrics, offset, traffic_dictionary):
if int(traffic_metrics[offset]) == 0:
traffic_dictionary['Size Distribution'] = 'Deterministic'
parameters = dict()
parameters['Average Packet Size'] = float(traffic_metrics[offset + 1])
traffic_dictionary['Size Distribution Parameters'] = parameters
elif int(traffic_metrics[offset]) == 1:
traffic_dictionary['Size Distribution'] = 'Uniform'
parameters = dict()
parameters['Average Packet Size'] = float(traffic_metrics[offset + 1])
parameters['Min Size'] = float(traffic_metrics[offset + 2])
parameters['Max Size'] = float(traffic_metrics[offset + 3])
traffic_dictionary['Size Distribution Parameters'] = parameters
elif int(traffic_metrics[offset]) == 2:
traffic_dictionary['Size Distribution'] = 'Binomial'
parameters = dict()
parameters['Average Packet Size'] = float(traffic_metrics[offset + 1])
parameters['Packet Size 1'] = float(traffic_metrics[offset + 2])
parameters['Packet Size 2'] = float(traffic_metrics[offset + 3])
traffic_dictionary['Size Distribution Parameters'] = parameters
elif int(traffic_metrics[offset]) == 3:
traffic_dictionary['Size Distribution'] = 'Generic'
parameters = dict()
parameters['Average Packet Size'] = float(traffic_metrics[offset + 1])
parameters['Number of Candidates'] = float(traffic_metrics[offset + 2])
for i in range(0, int(traffic_metrics[offset + 2]) * 2, 2):
parameters['Size %d' % (i / 2)] = float(traffic_metrics[offset + 3 + i])
parameters['Prob %d' % (i / 2)] = float(traffic_dictionary[offset + 4 + i])
traffic_dictionary['Size Distribution Parameters'] = parameters
else:
return -1
return 0
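    # Example, continuing the one above (values hypothetical): with offset 4 and
    # traffic_metrics ['0', '2.0', '1.5', '10.0', '0', '1000.0'], the '0' at index 4
    # selects the 'Deterministic' size distribution, so the dictionary gains
    # {'Size Distribution Parameters': {'Average Packet Size': 1000.0}}.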
"""
Get the maximum average lambda value and the string without it.
@param token: original string containing the maximum average lambda value and the rest of the string
@return maximum average lambda value
@return rest of string without maximum average lambda value
"""
def get_max_avg_lambda(self, token):
bar_index = token.find('|')
return token[0:bar_index], token[bar_index:].replace('|', '')
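    # Example: get_max_avg_lambda('1000.0|0,2.0,1.5') returns ('1000.0', '0,2.0,1.5'),
    # i.e. the substring before the first '|' and the remainder with the bar removed.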
"""
Extract information from the input file containing information about simulation number, graph files, and routing files.
@param filepath: input file path
@return simulation numbers
@return list of MatrixPath objects showing routings, if they exist
"""
def process_input_file(self, filepath):
input_file = open(filepath)
sim_numbers = []
graph_files = []
routing_files = []
net_size = str(self.topology_size)
for line in input_file:
input_data = line.split(';')
sim_numbers.append(input_data[0])
filepath_stem = self.data_directory
graph_file = filepath_stem + net_size + '\graphs\\' + input_data[1]
graph_files.append(graph_file)
routing_file = (filepath_stem + net_size + '\\routings\\' + input_data[2]).rstrip()
routing_files.append(routing_file)
input_file.close() # Close file once done processing
# Create routing matrix
routing_matrices = []
for graph, route in zip(graph_files, routing_files):
routing_matrix = self.create_routing_matrix(int(net_size), route)
routing_matrices.append(routing_matrix)
return sim_numbers, routing_matrices
"""
Create an adjacency matrix from the routing matrices.
@param routing_matrices: MatrixPath structure of routing information
@return adjacency matrix of routes
"""
def process_routing_matrix(self, routing_matrices):
adj_matrix = np.zeros(())
for i in range(0, self.topology_size, 1):
for j in range(0, self.topology_size, 1):
for k in routing_matrices[i][j]:
if k:
# Populate the adjacency matrix
src = k[0]
dest = k[-1]
route = k[1:-1]
return None
"""
Creating a routing object to show an n x n matrix and how each cell [i, j] contains the path to go from node i
to node j. If 'None' value is present, no route is available.
@param net_size: size of topology network
@param routing_file: file regarding information about routing configuration
@return MatrixPath object that shows routes between different nodes, if present
"""
def create_routing_matrix(self, net_size, routing_file):
MatrixPath = np.empty((net_size, net_size), dtype = object)
with open(routing_file) as rf:
for line in rf:
nodes = line.split(';')
nodes = list(map(int, nodes))
MatrixPath[nodes[0], nodes[-1]] = nodes
return (MatrixPath)
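    # Example: a routing file line '0;3;1' becomes nodes [0, 3, 1], so
    # MatrixPath[0, 1] = [0, 3, 1] (the path from node 0 to node 1 via node 3).
    # Cells for which no line exists stay None.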
"""
Create and return a graph readable structure for a specified GML markup.
@param filepath: directory of GML markup files for a topology size
@return graph structure with information about nodes and edges
"""
def graph_process(self, filepath):
graphs_dictionary = dict()
for topology_file in os.listdir(filepath):
G = networkx.read_gml(filepath + '/' + topology_file, destringizer = int)
#networkx.draw(G, with_labels = True, font_weight = 'bold')
graphs_dictionary[topology_file] = G
# Nodes of graph
nodes = G.nodes
# Topology edges
edges = G.edges # Returns list of tuples describing topology edges
# Information parameters related to a node
for i in nodes:
graphs_dictionary[i] = nodes[i]
# Information about link parameters
for i in edges: # (src node ID, dest node ID, link ID)
graphs_dictionary[i] = G[i[0]][i[1]][0]
return graphs_dictionary
"""
Given a simulation metrics file, extract information about the global packets, global losses, global delays, and
the metrics list for each instance.
@param filepath: simulation file of network topology
@return list of global packets measurements
@return list of global losses measurements
@return list of global delay measurements
@return list of dictionaries of metrics (performance matrix)
"""
def get_simulation_metrics(self, filepath):
sim_file = open(filepath)
global_packets_list = []
global_losses_list = []
global_delay_list = []
metrics_list = []
for line in sim_file:
measurement = line.split()
measurement_tokens = measurement[0].split(',')
global_packets = measurement_tokens[0] # Get global packets value
global_packets_list.append(global_packets)
measurement_tokens.remove(global_packets) # Remove value once finished
global_losses = measurement_tokens[0] # Get global losses value
global_losses_list.append(global_losses)
measurement_tokens.remove(global_losses) # Remove value once finished
global_delay_temp = measurement_tokens[0] # Get global delay value and modify
bar_index = global_delay_temp.find('|')
global_delay = global_delay_temp[0:bar_index]
global_delay_list.append(global_delay)
metrics_list_ = self.create_simulation_list(global_packets, global_losses, global_delay, measurement_tokens) # Get the rest of the list metrics
metrics_list.append(metrics_list_)
sim_file.close() # Close the file once done processing
return global_packets_list, global_losses_list, global_delay_list, metrics_list
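    # Sketch of the simulationResults.txt layout this parser assumes (hypothetical):
    # each line begins 'globalPackets,globalLosses,globalDelay|...', so the first
    # three comma-separated fields are peeled off as the global metrics and the
    # remaining '|'/';'-separated tokens are handed to create_simulation_list().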
"""
Extracting the individual list metrics and appending them all to one array.
@param global_delay: used to remove before iterating through the measurement tokens
@param measurement_tokens: array of metrics that are separated by '|' and ';'
@return list of dictionaries of metrics
"""
def create_simulation_list(self, global_packet, global_loss, global_delay, measurement_tokens):
metrics_list = []
modified_measurements = self.modify_tokens(measurement_tokens)
if global_delay in modified_measurements:
modified_measurements.remove(global_delay)
# Get the rest of the simulation tokens
for i in range(0, len(modified_measurements) - 11, 11):
metric_aggregated_dictionary = dict() # Re-initialize dictionary to hold the metrics
# Add global values
metric_aggregated_dictionary['Global Packet'] = global_packet
metric_aggregated_dictionary['Global Loss'] = global_loss
metric_aggregated_dictionary['Global Delay'] = global_delay
# Bandwidth
bandwidth = modified_measurements[i]
metric_aggregated_dictionary['Average Bandwidth'] = bandwidth
# Number of packets transmitted
number_packets_transmitted = modified_measurements[i + 1]
metric_aggregated_dictionary['Packets Transmitted'] = number_packets_transmitted
# Number of packets dropped
number_packets_dropped = modified_measurements[i + 2]
metric_aggregated_dictionary['Packets Dropped'] = number_packets_dropped
# Average per-packet delay
avg_delay = modified_measurements[i + 3]
metric_aggregated_dictionary['Average Per-Packet Delay'] = avg_delay
# Neperian logarithm of per-packet delay
neperian_logarithm = modified_measurements[i + 4]
metric_aggregated_dictionary['Neperian Logarithm'] = neperian_logarithm
# Percentile 10 of per-packet delay
percentile_10 = modified_measurements[i + 5]
metric_aggregated_dictionary['Percentile 10'] = percentile_10
# Percentile 20 of per-packet delay
percentile_20 = modified_measurements[i + 6]
metric_aggregated_dictionary['Percentile 20'] = percentile_20
# Percentile 50 of per-packet delay
percentile_50 = modified_measurements[i + 7]
metric_aggregated_dictionary['Percentile 50'] = percentile_50
# Percentile 80 of per-packet delay
percentile_80 = modified_measurements[i + 8]
metric_aggregated_dictionary['Percentile 80'] = percentile_80
# Percentile 90 of per-packet delay
percentile_90 = modified_measurements[i + 9]
metric_aggregated_dictionary['Percentile 90'] = percentile_90
# Variance of per-packet delay (jitter)
variance_delay = modified_measurements[i + 10]
metric_aggregated_dictionary['Jitter'] = variance_delay
metrics_list.append(metric_aggregated_dictionary) # Append to master list of metrics
return metrics_list
"""
Modify the measurement tokens list so that are no '|' or ';' present, but all tokens are separated and present.
@param measurement_tokens: array of metrics that are separated by '|' and ';'
@return modified list in which each token is present in the list
"""
def modify_tokens(self, measurement_tokens):
modified_measurements = []
# Modify measurements so they can be read and organized
for token in measurement_tokens:
if '|' in token:
new_token = token.split('|')
for t in new_token:
modified_measurements.append(t)
elif ';' in token:
new_token = token.split(';')
for t in new_token:
modified_measurements.append(t)
else:
modified_measurements.append(token)
return modified_measurements
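    # Example: modify_tokens(['0.17|12', '3;4', '5']) returns
    # ['0.17', '12', '3', '4', '5']: every '|'- or ';'-joined token is split out
    # into its own list entry.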
"""
Given a link usage file, extract information about various metrics present. If a link does not exist, populate the
list of metrics with all -1.
@param filepath: link usage file of source-destination pair
@return list of port statistics by each measure
"""
def get_link_usage_metrics(self, filepath):
link_file = open(filepath)
port_statistics_list = []
for line in link_file:
port_statistics = line.split(';')
for token in port_statistics:
port_statistics_individual = []
if token == '-1':
# Link does not exist
link_exists = False
port_statistics_individual.append(link_exists)
for i in range(8):
port_statistics_individual.append(-1)
port_statistics_list.append(port_statistics_individual)
elif token == '\n': # Go to next line at end of line
break
else:
tokens = token.split(',')
# The link exists
link_exists = True
port_statistics_individual.append(link_exists)
# Average utilization of the outgoing port
avg_utilization = tokens[0]
port_statistics_individual.append(avg_utilization)
# Average packet loss rate in the outgoing port
avg_packet_loss = tokens[1]
port_statistics_individual.append(avg_packet_loss)
# Average packet length of the packets transmitted through the outgoing port
avg_packet_length = tokens[2]
port_statistics_individual.append(avg_packet_length)
# Average utilization of the first queue of the outgoing port
avg_utilization_first = tokens[3]
port_statistics_individual.append(avg_utilization_first)
# Average packet loss rate in the first queue of the outgoing port
avg_packet_loss_rate = tokens[4]
port_statistics_individual.append(avg_packet_loss_rate)
# Average port occupancy (service and waiting queue) of the first queue of the outgoing port
avg_port_occupancy = tokens[5]
port_statistics_individual.append(avg_port_occupancy)
# Maximum queue occupancy of the first queue of the outgoing port
max_queue_occupancy = tokens[6]
port_statistics_individual.append(max_queue_occupancy)
# Average packet length of the packets transmitted through the first queue of the outgoing port
avg_packet_length_first = tokens[7]
port_statistics_individual.append(avg_packet_length_first)
port_statistics_list.append(port_statistics_individual)
link_file.close() # Close the file once done processing
return port_statistics_list
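    # Each returned row is [Link Exists, eight statistics]. A '-1' token yields
    # [False, -1, -1, -1, -1, -1, -1, -1, -1], while an existing link such as
    # '0.8,0.0,1000.0,0.7,0.0,5.2,32,990.0' (values hypothetical) yields
    # [True, '0.8', '0.0', '1000.0', '0.7', '0.0', '5.2', '32', '990.0'].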
"""
Function for plotting the types of time and size distributions for a particular network topology.
@param list_of_traffic_measurements: list of dictionaries of time and size distributions of traffic data
@param topology_size: number of nodes in network
"""
def plot_size_dist_type(self, list_of_traffic_measurements, topology_size):
time_distribution_type = []
size_distribution_type = []
for traffic_measurement in list_of_traffic_measurements:
for key in traffic_measurement:
if key == 'Time Distribution':
time_distribution_type.append(traffic_measurement['Time Distribution'])
elif key == 'Size Distribution':
size_distribution_type.append(traffic_measurement['Size Distribution'])
else:
continue
time_keys = Counter(time_distribution_type).keys()
time_values = Counter(time_distribution_type).values()
size_keys = Counter(size_distribution_type).keys()
size_values = Counter(size_distribution_type).values()
# Create bar graph for time distribution
time_x_pos = [i for i, _ in enumerate(time_keys)]
plt.bar(time_x_pos, time_values, color = 'aquamarine')
plt.xlabel("Time Distribution Type")
plt.ylabel("Count")
plt.title("Time Distributions for Topology Size " + str(topology_size))
plt.xticks(time_x_pos, time_keys)
#plt.show()
plt.savefig(f'plots/time_distribution_{topology_size}.png')
# Create bar graph for size distribution
size_x_pos = [i for i, _ in enumerate(size_keys)]
plt.bar(size_x_pos, size_values, color = 'deeppink')
plt.xlabel("Size Distribution Type")
plt.ylabel("Count")
plt.title("Size Distributions for Topology Size " + str(topology_size))
plt.xticks(size_x_pos, size_keys)
#plt.show()
plt.savefig(f'plots/size_distribution_{topology_size}.png')
"""
Creates plots of various time distribution characters of traffic metrics.
@param list_of_traffic_measurements: list of dictionaries of time and size distributions of traffic data
@param topology_size: number of nodes in network
"""
def plot_traffic_time_characteristics(self, list_of_traffic_measurements, topology_size):
equivalent_lambda_exp = []
avg_packets_lambda_exp = []
exp_max_factor_exp = []
equivalent_lambda_det = []
avg_packets_lambda_det = []
equivalent_lambda_uniform = []
min_packet_lambda = []
max_packet_lambda = []
equivalent_lambda_normal = []
avg_packet_lambda_normal = []
std_dev_normal = []
equivalent_lambda_onoff = []
packets_lambda_on = []
avg_time_off = []
avg_time_on = []
exp_max_factor_onoff = []
equivalent_lambda_ppbp = []
burst_gen_lambda = []
bit_rate = []
pare_min_size = []
pare_max_size = []
pare_alpha = []
exp_max_factor_ppbp = []
for traffic_measurement in list_of_traffic_measurements:
if traffic_measurement['Time Distribution'] == 'Exponential':
# Collect exponential distribution characteristics
equivalent_lambda_exp.append(traffic_measurement['Time Distribution Parameters']['Equivalent Lambda'])
avg_packets_lambda_exp.append(traffic_measurement['Time Distribution Parameters']['Average Packet Lambda'])
exp_max_factor_exp.append(traffic_measurement['Time Distribution Parameters']['Exponential Max Factor'])
elif traffic_measurement['Time Distribution'] == 'Deterministic':
# Collect deterministic distribution characteristics
equivalent_lambda_det.append(traffic_measurement['Time Distribution Parameters']['Equivalent Lambda'])
avg_packets_lambda_det.append(traffic_measurement['Time Distribution Parameters']['Average Packet Lambda'])
elif traffic_measurement['Time Distribution'] == 'Uniform':
# Collect uniform distribution characteristics
equivalent_lambda_uniform.append(traffic_measurement['Time Distribution Parameters']['Equivalent Lambda'])
min_packet_lambda.append(traffic_measurement['Time Distribution Parameters']['Min Packet Lambda'])
max_packet_lambda.append(traffic_measurement['Time Distribution Parameters']['Max Packet Lambda'])
elif traffic_measurement['Time Distribution'] == 'Normal':
# Collect normal distribution characteristics
equivalent_lambda_normal.append(traffic_measurement['Time Distribution Parameters']['Equivalent Lambda'])
avg_packet_lambda_normal.append(traffic_measurement['Time Distribution Parameters']['Average Packet Lambda'])
std_dev_normal.append(traffic_measurement['Time Distribution Parameters']['Standard Deviation'])
elif traffic_measurement['Time Distribution'] == 'OnOff':
# Collect on-off distribution characteristics
equivalent_lambda_onoff.append(traffic_measurement['Time Distribution Parameters']['Equivalent Lambda'])
packets_lambda_on.append(traffic_measurement['Time Distribution Parameters']['Packets Lambda On'])
avg_time_off.append(traffic_measurement['Time Distribution Parameters']['Average Time Off'])
avg_time_on.append(traffic_measurement['Time Distribution Parameters']['Average Time On'])
exp_max_factor_onoff.append(traffic_measurement['Time Distribution Parameters']['Exponential Max Factor'])
elif traffic_measurement['Time Distribution'] == 'PPBP':
# Collect PPBP distribution characteristics
equivalent_lambda_ppbp.append(traffic_measurement['Time Distribution Parameters']['Equivalent Lambda'])
burst_gen_lambda.append(traffic_measurement['Time Distribution Parameters']['Burst Gen Lambda'])
bit_rate.append(traffic_measurement['Time Distribution Parameters']['Bit Rate'])
pare_min_size.append(traffic_measurement['Time Distribution Parameters']['Pare to Min Size'])
pare_max_size.append(traffic_measurement['Time Distribution Parameters']['Pare to Max Size'])
pare_alpha.append(traffic_measurement['Time Distribution Parameters']['Pare to Alpha'])
exp_max_factor_ppbp.append(traffic_measurement['Time Distribution Parameters']['Exponential Max Factor'])
else:
continue
# Plot exponential distribution statistics
if len(equivalent_lambda_exp) != 0:
plt.boxplot([equivalent_lambda_exp, avg_packets_lambda_exp, exp_max_factor_exp],
vert = True,
labels = ['Equivalent Lambda', 'Average Packets Lambda', 'Exponential Max Factor'],
meanline = True)
plt.title("Exponential (Time) Statistics for Topology Size " + str(topology_size))
plt.savefig(f'plots/time/exp/{topology_size}/time_exponential_dist_statistics_{topology_size}.png')
# Plot deterministic distribution statistics
if len(equivalent_lambda_det) != 0:
plt.boxplot([equivalent_lambda_det, avg_packets_lambda_det],
vert = False,
labels = ['Equivalent Lambda', 'Average Packets Lambda'],
meanline = True)
plt.title("Deterministic (Time) Statistics for Topology Size " + str(topology_size))
plt.savefig(f'plots/time/det/{topology_size}/time_deterministic_dist_statistics_{topology_size}.png')
# Plot uniform distribution statistics
if len(equivalent_lambda_uniform) != 0:
plt.boxplot([equivalent_lambda_uniform, min_packet_lambda, max_packet_lambda],
vert = False,
labels = ['Equivalent Lambda', 'Min Packet Lambda', 'Max Packet Lambda'],
meanline = True)
plt.title("Uniform (Time) Statistics for Topology Size " + str(topology_size))
plt.savefig(f'plots/time/uniform/{topology_size}/time_uniform_dist_statistics_{topology_size}.png')
# Plot normal distribution statistics
if len(equivalent_lambda_normal) != 0:
plt.boxplot([equivalent_lambda_normal, avg_packet_lambda_normal, std_dev_normal],
vert = False,
labels = ['Equivalent Lambda', 'Average Packet Lambda', 'Standard Deviation'],
meanline = True)
plt.title("Normal (Time) Statistics for Topology Size " + str(topology_size))
plt.savefig(f'plots/time/normal/{topology_size}/time_normal_dist_statistics_{topology_size}.png')
# Plot on-off distribution statistics
if len(equivalent_lambda_onoff) != 0:
plt.boxplot([equivalent_lambda_onoff, packets_lambda_on, avg_time_off, avg_time_on, exp_max_factor_onoff],
vert = False,
labels = ['Equivalent Lambda', 'Packets Lambda On', 'Average Time Off', 'Average Time On', 'Exponential Max Factor'],
meanline = True)
plt.title("On-off (Time) Statistics for Topology Size " + str(topology_size))
plt.savefig(f'plots/time/onoff/{topology_size}/time_onoff_dist_statistics_{topology_size}.png')
# Plot PPBP distribution statistics
if len(equivalent_lambda_ppbp) != 0:
plt.boxplot([equivalent_lambda_ppbp, burst_gen_lambda, bit_rate, pare_min_size, pare_max_size, pare_alpha, exp_max_factor_ppbp],
vert = False,
labels = ['Equivalent Lambda', 'Burst Gen Lambda', 'Bit Rate', 'Pare to Min Size', 'Pare to Max Size', 'Pare to Alpha', 'Exponential Max Factor'],
meanline = True)
plt.title("PPBP (Time) Statistics for Topology Size " + str(topology_size))
plt.savefig(f'plots/time/ppbp/{topology_size}/time_ppbp_dist_statistics_{topology_size}.png')
"""
Creates plots of various size distribution characters of traffic metrics.
@param list_of_traffic_measurements: list of dictionaries of time and size distributions of traffic data
@param topology_size: number of nodes in network
"""
def plot_traffic_size_characteristics(self, list_of_traffic_measurements):
avg_packet_size_det = []
avg_packet_size_uniform = []
min_size = []
max_size = []
avg_packet_size_bi = []
packet_size_1 = []
packet_size_2 = []
avg_packet_size_generic = []
number_of_candidates = []
for traffic_measurement in list_of_traffic_measurements:
if traffic_measurement['Size Distribution'] == 'Deterministic':
# Collect deterministic distribution characteristics
avg_packet_size_det.append(traffic_measurement['Size Distribution Parameters']['Average Packet Size'])
elif traffic_measurement['Size Distribution'] == 'Uniform':
# Collect uniform distribution characteristics
avg_packet_size_uniform.append(traffic_measurement['Size Distribution Parameters']['Average Packet Size'])
min_size.append(traffic_measurement['Size Distribution Parameters']['Min Size'])
max_size.append(traffic_measurement['Size Distribution Parameters']['Max Size'])
elif traffic_measurement['Size Distribution'] == 'Binomial':
# Collect binomial distribution characteristics
avg_packet_size_bi.append(traffic_measurement['Size Distribution Parameters']['Average Packet Size'])
packet_size_1.append(traffic_measurement['Size Distribution Parameters']['Packet Size 1'])
packet_size_2.append(traffic_measurement['Size Distribution Parameters']['Packet Size 2'])
elif traffic_measurement['Size Distribution'] == 'Generic':
# Collect generic distribution characteristics
avg_packet_size_generic.append(traffic_measurement['Size Distribution Parameters']['Average Packet Size'])
number_of_candidates.append(traffic_measurement['Size Distribution Parameters']['Number of Candidates'])
else:
continue
# Plot deterministic distribution statistics
if len(avg_packet_size_det) != 0:
plt.boxplot(avg_packet_size_det,
vert = False,
labels = 'Average Packet Size',
meanline = True)
plt.title("Deterministic (Size) Statistics for Topology Size " + str(topology_size))
plt.savefig(f'plots/size/det/{topology_size}/size_deterministic_dist_statistics_{topology_size}.png')
# Plot uniform distribution statistics
if len(avg_packet_size_uniform) != 0:
plt.boxplot([avg_packet_size_uniform, min_size, max_size],
vert = False,
labels = ['Average Packet Size', 'Min Size', 'Max Size'],
meanline = True)
plt.title("Uniform (Size) Statistics for Topology Size " + str(topology_size))
plt.savefig(f'plots/size/uniform/{topology_size}/size_uniform_dist_statistics_{topology_size}.png')
# Plot binomial distribution statistics
if len(avg_packet_size_bi) != 0:
plt.boxplot([avg_packet_size_bi, packet_size_1, packet_size_2],
vert = False,
labels = ['Average Packet Size', 'Packet Size 1', 'Packet Size 2'],
meanline = True)
plt.title("Binomial (Size) Statistics for Topology Size " + str(topology_size))
plt.savefig(f'plots/size/bi/{topology_size}/size_binomial_dist_statistics_{topology_size}.png')
# Plot generic distribution statistics
if len(avg_packet_size_generic) != 0:
plt.boxplot([avg_packet_size_generic, number_of_candidates],
vert = False,
labels = ['Average Packet Size', 'Number of Candidates'],
meanline = True)
plt.title("Generic (Size) Statistics for Topology Size " + str(topology_size))
plt.savefig(f'plots/size/gen/{topology_size}/size_generic_dist_statistics_{topology_size}.png')
"""
Extract contents of tar files downloaded.
@param filename: path and name of the tar file (.tar)
@param path: where data should be located
"""
def extract(filename, path):
if filename.endswith("tar.gz"):
tar = tarfile.open(filename, "r:gz")
tar.extractall(path = path)
print("Path 1: " + filename + " unzipped.")
tar.close()
return
elif filename.endswith("tar"):
tar = tarfile.open(filename, "r:")
tar.extractall(path = path)
print("Path 2: " + filename + " unzipped.")
tar.close()
return
else:
print("Not a tar file.")
return
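# Usage sketch (archive name hypothetical):
#   extract('gnnet-dataset.tar.gz', 'training_data\\')
# unpacks the archive into training_data\; any argument that is not a tar file
# only prints "Not a tar file." and returns.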
"""
Within a directory, extract all the contents within all the tar files in that directory.
Print the progress of the extraction.
@param main_filepath: file path that contains all the tar files
"""
def extract_all_in_filepath(main_filepath):
files = os.listdir(main_filepath)
files_length = len(files)
extraction_progress = 0
for tar_file in os.listdir(main_filepath):
if tar_file.endswith('.gz'): # Tar files only
# All the names end with 'tar.gz' in this path
new_directory_path = main_filepath + tar_file.replace('tar.gz', '')
os.mkdir(new_directory_path)
extract(main_filepath + tar_file, new_directory_path)
extraction_progress = extraction_progress + 1
print("Unzipped " + str(extraction_progress) + " out of " + str(files_length) + " files.")
"""
Create the NetworkInput object for the training data.
@param data_directory: path to where training files are located
"""
def process_training_data(data_directory):
# Iterate through directories, subdirectories, and files for training dataset
for root, directories, file_list in os.walk(data_directory):
topology_size = None # Get the network topology size
input_file = None # Get the input filepath
traffic_file = None # Get the traffic filepath
link_file = None # Get the link usage filepath
sim_file = None # Get the simulation results filepath
output_name = None # Get the name of the .csv file to be created
try:
topology_size = int(root.replace(data_directory, "")[0:2])
graph_files = data_directory + str(topology_size) + '\graphs\\' # Get route to graph files directory
for name in file_list: # Retrieve the rest of the files
file_name = os.path.join(root, name)
if 'input_files.txt' in file_name.strip():
input_file = file_name
token_list = file_name.split('\\')
output_name = token_list[3]
print(output_name)
elif 'traffic.txt' in file_name.strip():
traffic_file = file_name
elif 'linkUsage.txt' in file_name.strip():
link_file = file_name
elif 'simulationResults.txt' in file_name.strip():
sim_file = file_name
if None not in (topology_size, input_file, traffic_file, link_file, graph_files, sim_file, output_name):
# Check to see if file already exists. If it does, skip processing this section of data.
if os.path.isfile('tabular_data\\' + str(topology_size) + '\\' + output_name + '.csv'):
print(output_name + '.csv already exists.')
continue
# Create the NetworkInput object if the .csv file does not exist for that section of data.
else:
print("Creating NetworkInput object.")
dataset = NetworkInput(data_directory,
topology_size,
input_file,
traffic_file,
link_file,
graph_files,
sim_file,
output_name)
print("Creating " + output_name + ".csv.")
dataset.write_to_csv()
print("Finished creating " + output_name + ".csv.")
except ValueError:
print("Empty string encountered. Continuing data processing.")
except TypeError:
print("NoneType encountered. Continuing data processing.")
"""
Create the NetworkInput object for the validation and test data.
@param data_directory: path to where validation or test files are located
"""
def process_validation_data(data_directory):
# Iterate through directories, subdirectories, and files for training dataset
for root, directories, file_list in os.walk(data_directory):
topology_size = None # Get the network topology size
input_file = None # Get the input filepath
traffic_file = None # Get the traffic filepath
link_file = None # Get the link usage filepath
sim_file = None # Get the simulation results filepath
output_name = None # Get the name of the .csv file to be created
if 'results' in root:
name = root.replace(data_directory, "")
token_list = name.split('\\')
topology_size = int(token_list[0])
output_name = token_list[1]
if (output_name + '\\' + output_name) in root:
for file_list_ in os.walk(root):
for item in file_list_: # Tuple iteration
if type(item) == list and len(item) > 1:
input_file = root + '\\' + item[0]
link_file = root + '\\' + item[1]
sim_file = root + '\\' + item[2]
traffic_file = root + '\\' + item[4]
graph_files = data_directory + str(topology_size) + '\graphs\\' # Get route to graph files directory
if None not in (topology_size, input_file, traffic_file, link_file, graph_files, sim_file, output_name):
# Check to see if file already exists. If it does, modify the file name.
csv_path = 'tabular_data\\' + str(topology_size) + '\\' + output_name + '.csv'
if os.path.isfile(csv_path):
print(output_name + '.csv already exists. Trying to modify the name.')
if 'validation' in data_directory and '-2' in data_directory:
output_name = output_name + '-2'
new_csv_path = 'tabular_data\\' + str(topology_size) + '\\' + output_name + '.csv'
if os.path.isfile(new_csv_path):
print(output_name + ' already exists as a validation file.')
continue
elif 'validation' in data_directory and '-3' in data_directory:
output_name = output_name + '-3'
new_csv_path = 'tabular_data\\' + str(topology_size) + '\\' + output_name + '.csv'
if os.path.isfile(new_csv_path):
print(output_name + ' already exists as a validation file.')
continue
elif 'test' in data_directory and '-1' in data_directory:
output_name = output_name + '-test-1'
new_csv_path = 'tabular_data\\' + str(topology_size) + '\\' + output_name + '.csv'
if os.path.isfile(new_csv_path):
print(output_name + ' already exists as a test file.')
continue
elif 'test' in data_directory and '-2' in data_directory:
output_name = output_name + '-test-2'
new_csv_path = 'tabular_data\\' + str(topology_size) + '\\' + output_name + '.csv'
if os.path.isfile(new_csv_path):
print(output_name + ' already exists as a test file.')
continue
elif 'test' in data_directory and '-3' in data_directory:
output_name = output_name + '-test-3'
new_csv_path = 'tabular_data\\' + str(topology_size) + '\\' + output_name + '.csv'
if os.path.isfile(new_csv_path):
print(output_name + ' already exists as a test file.')
continue
print("Creating NetworkInput object.")
dataset = NetworkInput(data_directory,
topology_size,
input_file,
traffic_file,
link_file,
graph_files,
sim_file,
output_name)
print("Creating " + output_name + ".csv.")
dataset.write_to_csv()
print("Finished creating " + output_name + ".csv.")
# Create the NetworkInput object if the .csv file does not exist for that section of data.
else:
print("Creating NetworkInput object.")
dataset = NetworkInput(data_directory,
topology_size,
input_file,
traffic_file,
link_file,
graph_files,
sim_file,
output_name)
print("Creating " + output_name + ".csv.")
dataset.write_to_csv()
print("Finished creating " + output_name + ".csv.")
# Driver code
if __name__ == "__main__":
    #extract('gnnet-ch21-dataset-test-with-labels.tar.gz', 'test_data\\')
    #for i in range(200, 320, 20):
    #    extract_all_in_filepath('test_data\\gnnet-ch21-dataset-test-with-labels\\ch21-test-with-labels-setting-1\\' + str(i) + '\\')
    #    extract_all_in_filepath('test_data\\gnnet-ch21-dataset-test-with-labels\\ch21-test-with-labels-setting-2\\' + str(i) + '\\')
    #    extract_all_in_filepath('test_data\\gnnet-ch21-dataset-test-with-labels\\ch21-test-with-labels-setting-3\\' + str(i) + '\\')
    # Delete all files with a particular .tar.gz extension
    #directory = 'training_data\\'
    #directory = 'validation_data\\'
    #directory = 'test_data\\'
    #for root, directories, file_list in os.walk(directory):
    #    for item in file_list:
    #        if item.endswith('tar.gz'):
    pass # All driver steps above are commented out