-
Notifications
You must be signed in to change notification settings - Fork 0
/
loc2yaml
executable file
·769 lines (668 loc) · 30.5 KB
/
loc2yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
#!/usr/bin/env python
# loc2yaml, CSV to YAML converter for UN/LOCODE data
# https://github.com/sergeuz/locode
#
# Dependencies:
# Python 3.2 or higher;
# pyyaml, https://bitbucket.org/xi/pyyaml
#
# This file is subject to the terms and conditions defined in LICENSE file,
# which is part of this source code package.
import yaml, csv, tempfile, datetime, getopt, re, os, sys
from collections import namedtuple
import locode
from locode import print_q, print_v
class Config:
    """Run-time settings of loc2yaml.

    A single instance is created at module level; parse_cmd_args() fills it
    in from the command line and the other functions only read it.
    """

    def __init__(self):
        # Input: source file paths and optional country filter
        self.src_files = []
        self.ctry_codes = set()  # Upper-cased two-letter country codes

        # Output: destination path and whether it denotes a single file
        self.dest_path = locode.DEFAULT_OUTPUT_BASENAME
        self.dest_is_file = False

        # Behaviour switches, all disabled by default
        self.replace = False
        self.merge = False
        self.simplify = False
        self.no_obs = False
        self.no_diacr = False
        self.add_info = False

        # Sanitized copies of the invocation which save_yml_file() writes
        # into the header of generated files
        self.safe_args = []
        self.safe_src_files = []
# Helper structures

# Properties of a city entry found in existing YAML data: its default name,
# the code of the region it is stored under and the set of parser hint flags
YamlCityProp = namedtuple('YamlCityProp', ['name', 'region', 'flags'])

# Properties of a location read from CSV data: its name and region code
CsvLocationProp = namedtuple('CsvLocationProp', ['name', 'region'])

# Result of update_yml_data(): lists describing added, deleted, moved and
# conflicting locations plus a flag telling that placeholder names were used.
# The type name matches the variable it is bound to, so repr() and pickling
# refer to a name that actually exists in this module.
UpdateStat = namedtuple('UpdateStat', ['added', 'deleted', 'moved', 'conflict', 'todo'])
# Current settings: the module-wide Config instance. parse_cmd_args() fills it
# in from the command line; the remaining functions read it.
cfg = Config()
# Some precompiled regular expressions

# Fragments which are cut out of a location name
re_cut = [
    re.compile(r"\(.*?\)")]  # Text within and including parentheses

# Suffixes marking entries to be skipped, see UN/LOCODE manual
re_skip = [
    re.compile(r"\sPt$", re.IGNORECASE),  # Port
    re.compile(r"\sApt$", re.IGNORECASE),  # Airport
    re.compile(r"\sFPSO$", re.IGNORECASE)]  # Floating Production Storage and Offloading (not in manual)

# Additional suffixes to skip; these ones are not stated in manual
re_skip_more = [
    re.compile(r"\sPort$", re.IGNORECASE),
    re.compile(r"\sAirport$", re.IGNORECASE),
    re.compile(r"\sTerminal$", re.IGNORECASE)]

# Abbreviations and their expansions, see UN/LOCODE manual
re_repl = [
    # A trailing \b would demand a word character right after the dot, so
    # "Bird I." or "I. of Skye" would never match; a negative lookahead
    # expresses "end of the abbreviation" correctly
    (re.compile(r"\bI\.(?!\w)", re.IGNORECASE), "Island"),
    (re.compile(r"\bPto\b", re.IGNORECASE), "Puerto"),
    (re.compile(r"\bSt\b", re.IGNORECASE), "Saint")]

# Dash preceded by whitespace
re_dash = re.compile(r"\s-")
def print_usage():
    """Print the command-line help to standard output.

    Version string, project homepage and default output directory name are
    taken from the locode module.
    """
    print("""CSV to YAML converter for UN/LOCODE data, version {version}
{homepage}
Usage:
loc2yaml [-c code] [-o path] [-s] file.csv [file.csv ...]
loc2yaml -m -o output.yaml file.yaml [file.yaml ...]
Arguments:
-c code[,code[,...]]
--country=code[,code[,...]]
Generate data for specified country codes only. Two-letter codes as
defined in ISO 3166-1 are recognized.
-o path
--output=path
Use specified directory for generated files, one YAML file per country.
If path points to .yaml file (existent or not), generated data for all
countries will be stored into single file.
-r
--replace
Replace destination files rather than try to update them.
-m
--merge
Merge data from separate YAML files into single file. Input files are
treated as YAML files in this mode.
-s
--simplify
Enable various heuristics in order to simplify location names, filter
out certain entries, such as transport nodal points and unverified
locations.
--no-obsolete
Remove obsolete entries while updating existent YAML files.
--no-diacritic
Use location names without diacritic signs.
--add-info
Put description of document structure into header of generated YAML
files. Can be useful for those who will edit these files manually.
-v
--verbose
Print various statistics while processing data.
-q
--quiet
Suppress any normal output.
--version
Show version string.
-h
--help
Show this message.
Examples:
loc2yaml unlocode.csv
Process all the data of 'unlocode.csv' file and put generated files
into default '{basename}' directory, creating it in current directory if
necessary.
loc2yaml -c us,ca -o path/to/yaml/files unlocode.csv
Process only data related to USA and Canada, putting generated files
into specified directory.
loc2yaml -o world.yaml *.csv
Generate single YAML file for all CSV files found in current directory.
Note that loc2yaml doesn't support wildcard file names natively.
loc2yaml -m -r -o world.yaml *.yaml
Merge all YAML files found in current directory into single file. All
content of destination file will be overwritten (see -r flag).
""".format(
        version=locode.VERSION_STRING,
        homepage=locode.PROJECT_HOMEPAGE,
        basename=locode.DEFAULT_OUTPUT_BASENAME))
def parse_cmd_args(argv):
    """Parse command line and store the result in the module-level cfg.

    argv is the full argument vector; its first element (program name) is
    ignored. Besides updating cfg (and the verbose/quiet switches of the
    locode module) the function reports the effective input files,
    destination and country filter via print_q().

    Exits the process after handling --help or --version.
    Raises getopt.GetoptError on unrecognized options and RuntimeError if
    no input files are given.
    """
    long_opts = [
        "country=", "output=", "replace", "merge", "simplify", "verbose", "quiet", "help",
        "no-obsolete", "no-diacritic", "add-info", "version"]  # Last row has no short alternatives
    opts, cfg.src_files = getopt.getopt(argv[1:], "c:o:rmsvqh", long_opts)

    # Options we don't want to disclose in generated files, see save_yml_file()
    undisclosed = ("-o", "--output", "-v", "--verbose", "-q", "--quiet")
    # Options without effect in merge mode; just to warn those who expect the opposite
    ignored_on_merge = ("-s", "--simplify", "--no-obsolete", "--no-diacritic")
    # Plain switches: option -> (object, attribute to be set to True)
    switches = {
        "-r": (cfg, "replace"), "--replace": (cfg, "replace"),
        "-m": (cfg, "merge"), "--merge": (cfg, "merge"),
        "-s": (cfg, "simplify"), "--simplify": (cfg, "simplify"),
        "--no-obsolete": (cfg, "no_obs"),
        "--no-diacritic": (cfg, "no_diacr"),
        "--add-info": (cfg, "add_info"),
        "-v": (locode, "verbose"), "--verbose": (locode, "verbose"),  # See locode.py
        "-q": (locode, "quiet"), "--quiet": (locode, "quiet"),  # See locode.py
    }

    for opt, value in opts:
        if opt in ("-h", "--help"):
            print_usage()
            sys.exit()
        if opt == "--version":
            print(locode.VERSION_STRING)
            sys.exit()
        if opt in ("-c", "--country"):
            for piece in value.split(','):
                piece = piece.strip()
                if len(piece) == 2:
                    cfg.ctry_codes.add(piece.upper())
                elif piece:
                    sys.stderr.write("Warning: \"{}\" doesn't look like ISO 3166-1 country code\n".format(piece))
        elif opt in ("-o", "--output"):
            cfg.dest_path = value
        elif opt in switches:
            target, attr = switches[opt]
            setattr(target, attr, True)
        else:
            assert False
        # Remember the option (and its value) unless it must stay hidden
        if opt not in undisclosed:
            cfg.safe_args.append(opt)
            if value:
                cfg.safe_args.append(value)

    if cfg.merge:
        for opt, _ in opts:
            if opt in ignored_on_merge:
                sys.stderr.write("Warning: \"{}\" has no effect in merge mode\n".format(opt))

    # Keeping bare file names for the header (hides paths), then converting
    # source files to absolute paths
    cfg.safe_src_files.extend(os.path.basename(path) for path in cfg.src_files)
    cfg.src_files[:] = [os.path.abspath(path) for path in cfg.src_files]

    total = len(cfg.src_files)
    if total == 0:
        raise RuntimeError("No input files specified.\nTry 'loc2yaml --help' for list of supported options.")
    if total == 1:
        print_q("Source file:", cfg.src_files[0])
    else:
        print_q("Source files:")
        for path in cfg.src_files:
            print_q(" {}".format(path))

    # Full path to destination file or directory
    cfg.dest_path = os.path.abspath(cfg.dest_path)
    has_yaml_ext = cfg.dest_path.endswith(".yaml")
    if cfg.merge or has_yaml_ext or os.path.isfile(cfg.dest_path):
        if not has_yaml_ext:
            cfg.dest_path += ".yaml"
        print_q("Destination file:", cfg.dest_path)
        cfg.dest_is_file = True
    else:
        print_q("Destination directory:", cfg.dest_path)

    # Country filter
    if not cfg.ctry_codes:
        print_q("Using all country codes")
    else:
        print_q("Country code(s):", ", ".join(sorted(cfg.ctry_codes)))
def simplify_name(orig_name):
    """Return simplified form of location name, or '' if entry should be skipped.

    The name is rejected (empty string returned) when it contains
    sublocations separated by '/', a dash preceded by whitespace, or one of
    the transport node identifiers listed in re_skip / re_skip_more.
    Otherwise parenthesized remarks are removed, the last of '='-separated
    alternatives is taken, commas are dropped and abbreviations from re_repl
    are expanded.
    """
    loc_name = orig_name
    # Skipping names with sublocations, e.g. "Astoria/Queens/New York"
    if '/' in loc_name:
        return ''
    # Assuming space-separated dash can't be a part of regular locality name
    if re_dash.search(loc_name) is not None:
        return ''
    # Removing any remarks and clarifications:
    # "Saint Petersburg (ex Leningrad)" -> "Saint Petersburg"
    # (loop variables are not called "re" to avoid shadowing the re module)
    for pattern in re_cut:
        loc_name = pattern.sub('', loc_name)
    loc_name = locode.simplify_str(loc_name)
    # Always using last name met among possible renamings and alternatives.
    # Sometimes this seems to favour French namings in case of Canada, for
    # example, but who said format is perfect...
    # "Peking = Beijing" -> "Beijing"
    loc_names = loc_name.split('=')
    if len(loc_names) > 1:
        loc_name = locode.simplify_str(loc_names[-1])
    # Some names consist of comma-separated parts and situations when this
    # rule is applied seem to be completely random regardless of what is
    # said in manual. Here we are just merging such parts
    loc_name = loc_name.replace(',', '')
    loc_name = locode.simplify_str(loc_name)
    # Skipping subordinate entries, e.g. "London-Heathrow Apt". These ones are
    # separated with just hyphen ('-') which makes it impossible to filter
    # out such locations completely as hyphen can also be a part of regular
    # city name. What we can do here is filter out locations related to
    # airports, for example
    if '-' in loc_name:
        sections = loc_name.split('-')
        # Skipping entry if transport node identifier (defined in manual)
        # is met as part of any section of the name
        if any(pattern.search(section) for section in sections for pattern in re_skip):
            return ''
        # Our custom filter check is applied only to last section of name
        # (there are a lot of useful "Port-Whatever" locations)
        if any(pattern.search(sections[-1]) for pattern in re_skip_more):
            return ''
    elif any(pattern.search(loc_name) for pattern in re_skip):
        # Regular location name: checking only for "official" identifiers,
        # which we recognize to skip certain entries
        return ''
    # Expanding abbreviations
    for pattern, repl in re_repl:
        loc_name = pattern.sub(repl, loc_name)
    return loc_name
def parse_csv_file(file_name, out_dict):
    """Read UN/LOCODE CSV file and add its locations to out_dict.

    out_dict is updated in place and has the following layout:
    {country code: {region code: {location code: location name}}}.
    Entries of countries not listed in cfg.ctry_codes (if the filter is set)
    and incomplete entries are ignored. With cfg.simplify enabled names are
    passed through simplify_name() and entries with rejected names or with
    "xx", "ur" or "rr" approval status are skipped.

    Statistics on skipped, renamed, orphan and duplicate entries are
    reported via print_v().

    Raises RuntimeError if a row doesn't consist of exactly 12 fields.
    """
    # Skipped, duplicated, orphan and other suspicious entries
    skipped = {}
    dups = {}
    orphs = {}
    renames = {}
    print_q("Processing file:", file_name)
    # Context manager guarantees the file is closed even if a malformed row
    # makes us raise in the middle of processing
    with open(file_name, encoding="cp1252", newline='') as f:
        reader = csv.reader(f)
        # See http://www.unece.org/fileadmin/DAM/cefact/locode/unlocode_manual.pdf
        for row in reader:
            if len(row) != 12:
                raise RuntimeError("Invalid number of fields, line: {}".format(reader.line_num))
            # Country code
            ctry_code = row[1].strip().upper()
            if cfg.ctry_codes and ctry_code not in cfg.ctry_codes:
                continue
            # Location name
            if cfg.no_diacr:
                loc_name = locode.simplify_str(row[4])  # Name without diacritic signs
            else:
                loc_name = locode.simplify_str(row[3])
            region_code = row[5].strip().upper()  # Subdivision code
            loc_code = row[2].strip().upper()  # Location code
            status = row[7].strip().casefold()  # Approval status
            # Skipping incomplete entries (usually act as delimiters)
            if not ctry_code or not loc_code or not loc_name:
                continue
            orphane = False
            if not region_code:
                region_code = locode.UNKNOWN_REGION_CODE
                orphane = True
            full_code = ctry_code + ' ' + region_code + ' ' + loc_code
            orig_name = loc_name
            if cfg.simplify:
                # Simplifying location name
                loc_name = simplify_name(orig_name)
                if (not loc_name or
                        status == "xx" or  # Will be removed from the next issue of UN/LOCODE
                        status == "ur" or  # Included on user's request, not officially approved
                        status == "rr"):  # Request rejected
                    skipped.setdefault(full_code, []).append(orig_name)
                    continue
                if ',' in orig_name:
                    # Names which originally consisted of comma-separated parts are
                    # likely subjects for further manual renaming
                    renames.setdefault(full_code, []).append(loc_name)
            region_dict = out_dict.setdefault(ctry_code, {})
            loc_dict = region_dict.setdefault(region_code, {})
            if loc_code in loc_dict:
                # Prefer shorter name among duplicate entries
                loc_cur_name = loc_dict[loc_code]
                if len(loc_name) < len(loc_cur_name):
                    loc_dict[loc_code] = loc_name
                name_list = dups.setdefault(full_code, [])
                if not name_list:
                    name_list.append(loc_cur_name)
                name_list.append(loc_name)
            else:
                loc_dict[loc_code] = loc_name
            if orphane:
                orphs[full_code] = loc_dict[loc_code]
    # Various statistics for further manual processing
    if skipped:
        print_v("Skipped entries (--simplify enabled):")
        for code, names in sorted(skipped.items()):
            print_v(" {}: {}".format(code, "; ".join(sorted(names))))
    if renames:
        print_v("Consider manual renaming (--simplify enabled):")
        for code, names in sorted(renames.items()):
            print_v(" {}: {}".format(code, "; ".join(sorted(names))))
    if orphs:
        print_v("Orphane entries (no region specified):")
        for code, name in sorted(orphs.items()):
            print_v(" {}: {}".format(code, name))
    if dups:
        print_v("Duplicate entries (shorter names preferred):")
        for code, names in sorted(dups.items()):
            print_v(" {}: {}".format(code, "; ".join(sorted(names))))
def _yml_city_name(yml_city):
    """Return default name of YAML city entry, or '' if it can't be determined."""
    if isinstance(yml_city, str):
        # In-place naming:
        # MOW: Moscow
        return yml_city
    if isinstance(yml_city, dict) and "name" in yml_city:
        yml_city_name = yml_city["name"]
        if isinstance(yml_city_name, str):
            # Separate element:
            # MOW:
            #   name: Moscow
            return yml_city_name
        if isinstance(yml_city_name, dict) and "default" in yml_city_name:
            # Default translation:
            # MOW:
            #   name:
            #     default: Moscow
            return yml_city_name["default"]
    return ''


def _yml_city_flags(yml_city):
    """Return set of case-folded parser hint flags of YAML city entry."""
    flags = set()
    # Hint flags for loc2yaml, e.g. ".loc2yaml: preserve"
    if isinstance(yml_city, dict) and locode.PARSER_HINT_TAG in yml_city:
        for flag in yml_city[locode.PARSER_HINT_TAG].split(','):
            flag = flag.strip().casefold()
            if flag:
                flags.add(flag)
    return flags


def update_yml_data(yml_root, csv_data):
    """Update YAML data in place with locations parsed from CSV files.

    yml_root is a dictionary with "country" key holding per-country data:
    {"country": {code: {"name": ..., "region": {code: {"name": ...,
    "city": {code: name or dictionary}}}}}}.
    csv_data is a dictionary filled by parse_csv_file():
    {country code: {region code: {location code: location name}}}.

    For every country of csv_data new locations are added, missing or empty
    default names of existing locations are filled in, locations stored
    under the "unknown" region are moved to the region given in CSV (unless
    protected with "preserve" hint) and, if cfg.no_obs is set, unprotected
    locations absent from CSV are removed. Placeholder names carrying
    locode.TODO_MARKER are generated for countries and regions without name.

    Returns UpdateStat describing performed changes.
    """
    # Update statistics
    stat_add = []
    stat_del = []
    stat_moved = []
    stat_conf = []
    stat_todo = False
    yml_ctry_root = yml_root["country"]
    for csv_ctry_code, csv_region_root in csv_data.items():
        # YAML's city properties for current country
        yml_city_prop = {}
        yml_ctry = yml_ctry_root.setdefault(csv_ctry_code, {})
        yml_region_root = yml_ctry.setdefault("region", {})
        for yml_region_code, yml_region in yml_region_root.items():
            if "city" not in yml_region:
                continue
            for yml_city_code, yml_city in yml_region["city"].items():
                # Updating city properties table
                yml_city_prop[yml_city_code] = YamlCityProp(
                    name=_yml_city_name(yml_city),
                    region=yml_region_code,
                    flags=_yml_city_flags(yml_city))
        # Providing placeholder for default country name if necessary
        if "name" not in yml_ctry or not yml_ctry["name"]:
            # TODO: Actually original CSV files seem to contain country names in English
            yml_ctry["name"] = csv_ctry_code.capitalize() + ' ' + locode.TODO_MARKER
            stat_todo = True  # Some names need to be provided externally
        # Adding new locations; filling default names
        csv_loc_prop = {}
        for csv_region_code, csv_city_root in csv_region_root.items():
            yml_region = yml_region_root.setdefault(csv_region_code, {})
            if "name" not in yml_region or not yml_region["name"]:
                if csv_region_code == locode.UNKNOWN_REGION_CODE:
                    yml_region["name"] = "Unknown " + locode.TODO_MARKER
                else:
                    # Cannot know full region name
                    yml_region["name"] = csv_region_code.capitalize() + ' ' + locode.TODO_MARKER
                stat_todo = True
            yml_city_root = yml_region.setdefault("city", {})
            for csv_loc_code, csv_loc_name in csv_city_root.items():
                # Updating location properties table
                csv_loc_prop[csv_loc_code] = CsvLocationProp(name=csv_loc_name, region=csv_region_code)
                if csv_loc_code in yml_city_root:
                    # Checking if existent location has default name associated
                    yml_city = yml_city_root[csv_loc_code]
                    if isinstance(yml_city, dict):
                        if "name" in yml_city:
                            yml_city_name = yml_city["name"]
                            if isinstance(yml_city_name, dict):
                                if "default" not in yml_city_name or not yml_city_name["default"]:
                                    yml_city_name["default"] = csv_loc_name  # Default translation
                            elif yml_city_name is None or yml_city_name == '':
                                # Separate element. The name has to be stored back
                                # into the entry: rebinding a local variable would
                                # leave YAML data untouched
                                yml_city["name"] = csv_loc_name
                        else:
                            yml_city["name"] = csv_loc_name  # Separate element
                    elif yml_city is None or yml_city == '':
                        # In-place naming; stored back into the region for the
                        # same reason as above
                        yml_city_root[csv_loc_code] = csv_loc_name
                elif csv_loc_code in yml_city_prop:
                    # Location exists in YAML but belongs to different region
                    yml_city = yml_city_prop[csv_loc_code]
                    full_code_yml = csv_ctry_code + ' ' + yml_city.region + ' ' + csv_loc_code
                    full_code_csv = csv_ctry_code + ' ' + csv_region_code + ' ' + csv_loc_code
                    if yml_city.region == locode.UNKNOWN_REGION_CODE:
                        if locode.PARSER_HINT_PRESERVE not in yml_city.flags:
                            # Moving YAML's city to another region only if its current region is not specified
                            yml_city_root[csv_loc_code] = yml_region_root[yml_city.region]["city"].pop(csv_loc_code)
                            stat_moved.append((full_code_yml, full_code_csv, yml_city.name))  # Update statistics
                    elif csv_region_code != locode.UNKNOWN_REGION_CODE:
                        # CSV and YAML contain same location in different regions
                        stat_conf.append((full_code_yml, full_code_csv, yml_city.name))
                else:
                    # Adding new location
                    yml_city_root[csv_loc_code] = csv_loc_name  # In-place naming
                    full_code = csv_ctry_code + ' ' + csv_region_code + ' ' + csv_loc_code
                    stat_add.append((full_code, csv_loc_name))  # Update statistics
        # Removing obsolete locations if requested
        if cfg.no_obs:
            for yml_city_code, yml_city in yml_city_prop.items():
                if yml_city_code not in csv_loc_prop and locode.PARSER_HINT_PRESERVE not in yml_city.flags:
                    del yml_region_root[yml_city.region]["city"][yml_city_code]
                    full_code = csv_ctry_code + ' ' + yml_city.region + ' ' + yml_city_code
                    stat_del.append((full_code, yml_city.name))  # Update statistics
    return UpdateStat(added=stat_add, deleted=stat_del, moved=stat_moved, conflict=stat_conf, todo=stat_todo)
def save_yml_file(yml_root, file_name):
    """Write YAML data preceded by a commented header to file_name.

    The header contains loc2yaml version, project homepage, UTC timestamp
    and the sanitized command-line arguments and source file names kept in
    cfg.safe_args / cfg.safe_src_files. If cfg.add_info is set, description
    of document structure is appended to the header. The data itself is
    written by locode.write_yml_data().

    The file is created (or truncated) as UTF-8 text with '\\n' line ends.
    """
    # Timezone-aware replacement of deprecated utcnow(); formatted text is the same
    timestamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d, %H:%M:%S")
    # Context manager closes the file even if writing fails half-way
    with open(file_name, 'w', encoding="utf-8", newline='\n') as f:
        # Header text
        f.write("""# Generated by loc2yaml {version},
# CSV to YAML converter for UN/LOCODE data:
# {homepage}
#
# Last updated: {timestamp} (UTC)
# Command-line arguments: {args}
# Source files: {files}
#
# UN/LOCODE homepage:
# http://www.unece.org/cefact/locode/welcome.html
""".format(
            version=locode.VERSION_STRING,
            homepage=locode.PROJECT_HOMEPAGE,
            timestamp=timestamp,
            args=' '.join(cfg.safe_args),
            files=";\n# ".join(cfg.safe_src_files)))
        # Some hints for those who will update file manually
        if cfg.add_info:
            f.write("""#
# Document structure (http://en.wikipedia.org/wiki/YAML):
#
# 01 country:
# 02 'NO':
# 03 name:
# 04 default: Norge
# 05 en: Norway
# 06 region:
# 07 '01':
# 08 name: Østfold
# 09 city:
# 10 FRK: Fredrikstad
# 11 SPG: Sarpsborg
# 12 '02':
# 13 name: 02 {todo}
# 14 {no_region}:
# 15 city:
# 16 .EMR:
# 17 name: Emerald City
# 18 {hint_tag}: {hint_preserve}, {hint_odd}
#
# Format notes:
#
# * Country, region and city codes are case-sensitive (lines 2, 7, 10, 11, 12,
# 16), loc2yaml expects them to be in upper case to update files correctly.
# Reserved words such as 'name', 'region', etc. should be in lower case.
# * Country codes should comply with 2-letter codes defined in ISO 3166-1
# standard (line 2).
# * Custom region codes are formally allowed, though should be used only if
# absolutely necessary. Ensure there are no intersections with codes defined
# in ISO 3166-2 for country subdivisions (lines 7, 12, 14). Note that these
# standard subdivision codes are used without redundant country prefix.
# * Custom city codes are allowed for entries missing in source UN/LOCODE data.
# In above context term 'city' may correspond to any locality within grouping
# region (lines 10, 11, 16).
# * loc2yaml groups orphane localities using special '{no_region}' region code
# (line 14). In most if not all cases such entries should be moved to proper
# region manually.
# * There are several ways to provide names for countries, regions and cities.
# In-place naming, where entry name follows its code right at the same line,
# is supported only for cities (lines 10, 11).
# * Translations to different languages can be provided by expanding 'name'
# element (lines 3-5). It's recommended to use ISO 639-1 language codes and
# always provide a "default" translation.
# * When loc2yaml is unable to provide full name, e.g. region names in newly
# generated files, it uses placeholder name based on locality code with
# '{todo}' suffix added to it (line 13).
# * Comma-separated list of parser hints (described below) can be provided for
# loc2yaml to alter its behavior when updating certain entries (line 18).
#
# Parser hints:
#
# * {hint_preserve}
# Protects certain entry from deletion when loc2yaml is invoked to update
# existent YAML file and source UN/LOCODE data doesn't contain such entry
# anymore. Usually applied to custom entries when loc2yaml is expected to be
# used with --no-obsolete argument.
# * {hint_odd}
# Not recognized by loc2yaml directly, but can be useful to mark undesirable
# entries for target applications. Being simply removed such entries likely
# will be restored on next updating invocation of loc2yaml.
""".format(
                no_region=locode.UNKNOWN_REGION_CODE,
                todo=locode.TODO_MARKER,
                hint_tag=locode.PARSER_HINT_TAG,
                hint_preserve=locode.PARSER_HINT_PRESERVE,
                hint_odd=locode.PARSER_HINT_ODD))
        f.write('\n')
        locode.write_yml_data(yml_root, f)
def do_gen():
    """Generate or update YAML files from the CSV files listed in cfg.src_files.

    All CSV files are parsed into single dictionary. Unless cfg.replace is
    set, existent destination YAML files are loaded first so that they get
    updated rather than overwritten. Resulting files are written into a
    temporary directory and then transferred to the destination with
    locode.transact_copy(). Update statistics is reported at the end.

    Raises RuntimeError if destination path exists but is not a directory.
    """
    # Parsing all CSV files into single dictionary
    csv_data = {}
    for file in cfg.src_files:
        parse_csv_file(file, csv_data)
    # Working either with all countries found in CSV files (if no country
    # filter enabled) or only with specified country codes
    if cfg.ctry_codes:
        ctry_codes = cfg.ctry_codes & csv_data.keys()
        missing = cfg.ctry_codes - csv_data.keys()
        if missing:
            sys.stderr.write("Warning: Country codes not found in CSV data: {}\n".format(", ".join(sorted(missing))))
    else:
        ctry_codes = set(csv_data.keys())
    # Getting names of existent YAML files if necessary
    yml_files = []
    if not cfg.replace:
        if not cfg.dest_is_file:
            if ctry_codes:
                for ctry_code in ctry_codes:
                    # Lower-cased exactly as file names are produced below
                    yml_file = os.path.join(cfg.dest_path, ctry_code.lower() + ".yaml")
                    if os.path.isfile(yml_file):
                        yml_files.append(yml_file)
            elif os.path.isdir(cfg.dest_path):
                for file_name in os.listdir(cfg.dest_path):
                    if not file_name.endswith(".yaml") or len(file_name) != 7:  # XX.yaml, where XX is 2-letter code
                        continue
                    file_name = os.path.join(cfg.dest_path, file_name)
                    if os.path.isfile(file_name):
                        yml_files.append(file_name)
        elif os.path.isfile(cfg.dest_path):
            yml_files.append(cfg.dest_path)
    # Parsing all YAML files into single dictionary
    yml_root = {"country": {}}
    for file_name in yml_files:
        locode.parse_yml_file(file_name, yml_root, ctry_codes)
    print_q("Generating data...")
    stat = update_yml_data(yml_root, csv_data)  # Go!
    # Ensure destination path exists
    dest_path = cfg.dest_path
    if cfg.dest_is_file:
        dest_path = os.path.dirname(dest_path)
    if not os.path.exists(dest_path):
        os.makedirs(dest_path)
    elif not os.path.isdir(dest_path):
        raise RuntimeError("Destination path is not a directory: {}".format(dest_path))
    # All destination files are created in temporary directory first; the
    # context manager removes it afterwards, also when saving fails
    with tempfile.TemporaryDirectory() as temp_dir:
        if not cfg.dest_is_file:
            # Saving each country data into separate YAML file
            for ctry_code, yml_ctry in yml_root["country"].items():
                if len(ctry_code) != 2:
                    continue  # Strange country entries are not going to be saved as separate file
                save_yml_file({"country": {ctry_code: yml_ctry}},
                              os.path.join(temp_dir, ctry_code.lower() + ".yaml"))
        else:
            # Saving everything into single YAML file
            save_yml_file(yml_root, os.path.join(temp_dir, os.path.basename(cfg.dest_path)))
        # Finally copying all generated files into specified destination directory
        locode.transact_copy(temp_dir, dest_path, cfg.replace)
    # Printing update summary; detailed lists make sense only when existent
    # files were updated
    if yml_files:
        if stat.added:
            print_v("Added locations:")
            for code, name in sorted(stat.added):
                print_v(" {}: {}".format(code, name))
        if stat.deleted:
            print_v("Removed locations (--no-obsolete enabled):")
            for code, name in sorted(stat.deleted):
                print_v(" {}: {}".format(code, name))
        if stat.moved:
            print_v("Locations moved to another region:")
            for code_src, code_dest, name in sorted(stat.moved):
                print_v(" {} -> {}: {}".format(code_src, code_dest, name))
        if stat.conflict:
            print_v("Locations with conflicting regions:")
            for code_yml, code_csv, name in sorted(stat.conflict):
                print_v(" {}, {} (UN/LOCODE): {}".format(code_yml, code_csv, name))
    if stat.todo:
        sys.stderr.write("""Warning: Some entry names are missing and should be provided manually, search
for {} marker in generated file contents.\n""".format(locode.TODO_MARKER))
    if stat.added:
        print_q("Summary: Added {} new location(s)".format(len(stat.added)))
    else:
        print_q("Summary: No new locations")
    if stat.deleted:
        print_q("Removed {} obsolete location(s)".format(len(stat.deleted)))
def do_merge():
    """Merge YAML files listed in cfg.src_files into single destination file.

    Unless cfg.replace is set, countries already stored in an existent
    destination file are kept; a country present both in the destination
    and in source files is taken from the source files. The result is
    written into a temporary directory and then transferred to the
    destination with locode.transact_copy().

    Raises RuntimeError if directory part of destination path exists but is
    not a directory.
    """
    # Parsing all YAML files into single dictionary
    yml_src_data = {"country": {}}
    for file_name in cfg.src_files:
        locode.parse_yml_file(file_name, yml_src_data, cfg.ctry_codes)
    if cfg.ctry_codes:
        missing = cfg.ctry_codes - yml_src_data["country"].keys()
        if missing:
            sys.stderr.write("Warning: Country codes not found in YAML data: {}\n".format(", ".join(sorted(missing))))
    if not cfg.replace and os.path.isfile(cfg.dest_path):
        yml_dest_data = {"country": {}}
        locode.parse_yml_file(cfg.dest_path, yml_dest_data, cfg.ctry_codes)
        print_q("Merging data...")
        # Merging has to happen at country level: updating the root
        # dictionary would replace the whole "country" mapping and thus drop
        # everything loaded from destination file
        yml_dest_data["country"].update(yml_src_data["country"])
    else:
        yml_dest_data = yml_src_data
    # Ensure destination path exists
    assert cfg.dest_is_file
    dest_path = os.path.dirname(cfg.dest_path)
    if not os.path.exists(dest_path):
        os.makedirs(dest_path)
    elif not os.path.isdir(dest_path):
        raise RuntimeError("Destination path is not a directory: {}".format(dest_path))
    # Creating target file in temporary directory first; the context manager
    # removes the directory afterwards, also when saving fails
    with tempfile.TemporaryDirectory() as temp_dir:
        # Saving everything into single YAML file
        save_yml_file(yml_dest_data, os.path.join(temp_dir, os.path.basename(cfg.dest_path)))
        # Finally copying all generated files into specified destination directory
        locode.transact_copy(temp_dir, dest_path, cfg.replace)
def main(argv):
    """Program entry point: run loc2yaml with given argument vector.

    Depending on the command line either generates/updates YAML files from
    CSV data or merges YAML files into single file. Anticipated failures are
    reported to standard error and terminate the process with exit status 1;
    invalid command-line options additionally print the usage text.
    """
    def fail(message):
        # Report the problem and terminate with failure status
        sys.stderr.write(message)
        sys.exit(1)

    try:
        # Parsing command-line arguments
        parse_cmd_args(argv)
        # Either merge existent YAML files into single file, or parse CSV
        # files and create/update YAML files
        action = do_merge if cfg.merge else do_gen
        action()
    except getopt.GetoptError:
        print_usage()
        sys.exit(1)
    except FileNotFoundError as err:
        fail("Error: Unable to open file: {}\n".format(err.filename))
    except yaml.parser.ParserError as err:
        where = err.problem_mark
        fail("Error: Unable to parse YAML file: {}, line {}: {}\n".format(where.name, where.line + 1, err.problem))
    except yaml.YAMLError as err:
        fail("Error: Unable to process YAML data: {}\n".format(err))
    except RuntimeError as err:
        fail("Error: {}\n".format(err))


if __name__ == "__main__":
    main(sys.argv)