-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert_models_all.py
75 lines (60 loc) · 5.48 KB
/
convert_models_all.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import os
import t5paths
import shutil
from convert_t5x_checkpoint_to_flax import convert_t5x_checkpoint_to_flax as convert_t5x
from create_pytorch_tf_and_vocab import create_pytorch_tf_and_vocab as create_transformers
from copy_additional_files_from_bucket import copy_additional_files_from_bucket as copy_adf
from huggingface_hub import HfApi, Repository
# Hugging Face Hub API client.
# NOTE(review): `api` is not referenced again in the visible part of this script.
api = HfApi()
# Default model list as returned by t5paths.t5paths(). The name is rebound to a
# hard-coded list further down, before it is first read.
temp_paths = t5paths.t5paths()
# Prefix of every model's local folder. The model name is appended by plain
# string concatenation, so the trailing slash is required.
model_local_dir = "/home/perk/models/"
# Force flags: when True, the corresponding step runs even if its marker file
# already exists in the model folder.
# forceConvert -> checkpoint conversion (marker: flax_model.msgpack)
forceConvert=True
# forceTFPT -> PyTorch/TensorFlow export (marker: pytorch_model.bin)
forceTFPT=True
# forceFiles -> copy of additional files from the bucket (marker: config.gin)
forceFiles=True
# Converts a single finetuned model
#temp_paths = [{'name': 'demo-nynorsk-base', 'path': 'gs://north-t5x/finetuned/bokmaal_nynorsk/nynorsk_balanced_base_v1/', 'checkpoint': 'gs://north-t5x/finetuned/bokmaal_nynorsk/nynorsk_balanced_base_v1/checkpoint_1703000', 'private': True, 'size': 'base'}]
#temp_paths = [{'name': 'demo-deuncaser-base', 'path': 'gs://north-t5x/finetuned/deuncaser/deuncaser_base_v1/', 'checkpoint': 'gs://north-t5x/finetuned/deuncaser/deuncaser_base_v1/checkpoint_1750000', 'private': True, 'size': 'base'}]
#temp_paths = [{'name': 't5_base_scand3M', 'path': 'gs://nb-t5x-us-central2/scandinavian3k_solo_t5x_base/', 'checkpoint': 'gs://nb-t5x-us-central2/scandinavian3k_solo_t5x_base/checkpoint_3000000', 'private': True, 'size': 'base'}]
#temp_paths = [{'name': 't5_large_scand3M', 'path': 'gs://nb-t5x-us-central2/scandinavian3k_t5x_large/', 'checkpoint': 'gs://nb-t5x-us-central2/scandinavian3k_t5x_large/checkpoint_3000000', 'private': True, 'size': 'large'}, {'name': 't5_xl_scand3M', 'path': 'gs://nb-t5x-us-central2/scandinavian3k_t5x_xl/', 'checkpoint': 'gs://nb-t5x-us-central2/scandinavian3k_t5x_xl/checkpoint_3000000', 'private': True, 'size': 'xl'}]
# Samisk
#temp_paths = [{'name': 'fine_North_base', 'path': 'gs://nb-t5x-us-central2/finetuned/scandi3_3stammer_v2_base/', 'checkpoint': 'gs://nb-t5x-us-central2/finetuned/scandi3_3stammer_v2_base/checkpoint_3138000', 'private': False, 'size': 'base'}, {'name': 'fine_North_large', 'path': 'gs://nb-t5x-us-central2/finetuned/scandi3_3stammer_v2_large/', 'checkpoint': 'gs://nb-t5x-us-central2/finetuned/scandi3_3stammer_v2_large/checkpoint_3170000', 'private': True, 'size': 'large'}, {'name': 'fine_North_xl', 'path': 'gs://nb-t5x-us-central2/finetuned/scandi3_3stammer_v2_xl/', 'checkpoint': 'gs://nb-t5x-us-central2/finetuned/scandi3_3stammer_v2_xl/checkpoint_3100000', 'private': True, 'size': 'xl'}]
# Nynorsk
#temp_paths =
#[{'name': 'nynorsk_North_small', 'path': 'gs://nb-t5x-us-central2/finetuned/nynorsk_balanced_small_v5/', 'checkpoint': 'gs://nb-t5x-us-central2/finetuned/nynorsk_balanced_small_v5/checkpoint_1705000', 'private': False, 'size': 'small'}, {'name': 'nynorsk_North_base', 'path': 'gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v2/', 'checkpoint': 'gs://nb-t5x-us-central2/finetuned/nynorsk_NCC_base_v2/checkpoint_1505000', 'private': False, 'size': 'base'}, {'name': 'nynorsk_North_large', 'path': 'gs://nb-t5x-us-central2/finetuned/nynorsk_balanced_large_v1/', 'checkpoint': 'gs://nb-t5x-us-central2/finetuned/nynorsk_balanced_large_v1/checkpoint_1705000', 'private': False, 'size': 'large'}]
# Nynorsk Long
# Active model list. Each entry describes one fine-tuned model:
#   name       - name of the model folder under model_local_dir
#   path       - bucket folder the additional files are copied from
#   checkpoint - checkpoint that is converted
#   private    - visibility flag (not read by this script)
#   size       - selects the "<size>.json" config file
temp_paths = [
    {
        'name': 'nynorsk_North_small_long',
        'path': 'gs://nb-t5x-us-central2/finetuned/nynorsk_balanced_small_long_v1/',
        'checkpoint': 'gs://nb-t5x-us-central2/finetuned/nynorsk_balanced_small_long_v1/checkpoint_1705000',
        'private': False,
        'size': 'small',
    },
    {
        'name': 'nynorsk_North_base_long',
        'path': 'gs://nb-t5x-us-central2/finetuned/nynorsk_balanced_base_long_v1/',
        'checkpoint': 'gs://nb-t5x-us-central2/finetuned/nynorsk_balanced_base_long_v1/checkpoint_1705000',
        'private': False,
        'size': 'base',
    },
    {
        'name': 'nynorsk_North_large_long',
        'path': 'gs://nb-t5x-us-central2/finetuned/nynorsk_balanced_large_long_v1/',
        'checkpoint': 'gs://nb-t5x-us-central2/finetuned/nynorsk_balanced_large_long_v1/checkpoint_1705000',
        'private': False,
        'size': 'large',
    },
]
# For debugging: restrict the run to a single entry (the slice [1:2] below keeps only the second one)
#temp_paths = temp_paths[1:2]
#print(temp_paths)
# Select the models to process: every entry of temp_paths whose name does not
# contain "xxl". The name of each selected model is echoed as it is picked.
paths = []
for m in temp_paths:
    if "xxl" not in m['name']:
        paths.append(m)
        # NOTE(review): the source had lost its indentation; this print is
        # assumed to belong to the `if` (echo only selected models) - confirm.
        print(m['name'])
# Process every selected model in turn: refresh the local repository, convert
# the checkpoint, copy additional files from the bucket, export PyTorch and
# TensorFlow weights, and push the result to the hub. Each of the three middle
# steps is skipped when its marker file already exists and its force flag is
# False.
for m in paths:
    # Local folder of this model. model_local_dir ends with "/", so plain
    # concatenation yields the full path.
    model_dir = model_local_dir + m['name']
    print(model_dir)

    # Pull before doing any work; presumably so the push at the end applies on
    # top of the latest remote state - confirm.
    repo = Repository(local_dir=model_dir)
    repo.git_pull()

    # Step 1: checkpoint conversion (marker: flax_model.msgpack).
    if forceConvert or not os.path.exists(model_dir + "/flax_model.msgpack"):
        print(f"***Starting to convert {m['name']}")
        # The config file is picked by model size; models with "byt5" in
        # their name use the "byt5_"-prefixed variant.
        config_file = m['size'] + '.json'
        if "byt5" in m['name']:
            config_file = "byt5_" + config_file
        # The same config is installed as config.json and handed to the converter.
        shutil.copyfile(config_file, model_dir + '/config.json')
        convert_t5x(m['checkpoint'], config_file, model_dir)
    else:
        print("***Dropping conversion")

    # Step 2: additional files from the bucket (marker: config.gin).
    if forceFiles or not os.path.exists(model_dir + "/config.gin"):
        print(f"***Starting to copy additional files to {m['name']} from the bucket")
        copy_adf(m['path'], model_dir)
    else:
        print("***Dropping to copy files")

    # Step 3: PyTorch and TensorFlow export (marker: pytorch_model.bin).
    if forceTFPT or not os.path.exists(model_dir + "/pytorch_model.bin"):
        print(f"***Starting to convert {m['name']} to pyTorch and tensorflow")
        create_transformers(model_dir, m['size'])
    else:
        print("***Dropping conversion to PyTorch and Tensorflow")

    # Step 4: publish everything now present in the local folder.
    print(f"***Starting to push all the files for {m['name']} to the hub")
    repo.push_to_hub(commit_message="Commit from model create scripts")
    print(f"Finished pushing {m['name']}.")