diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index d77bf1a..0000000
--- a/.gitignore
+++ /dev/null
@@ -1,144 +0,0 @@
-# General
-.DS_Store
-.AppleDouble
-.LSOverride
-
-# Icon must end with two \r
-Icon
-
-
-# Thumbnails
-._*
-
-# Files that might appear in the root of a volume
-.DocumentRevisions-V100
-.fseventsd
-.Spotlight-V100
-.TemporaryItems
-.Trashes
-.VolumeIcon.icns
-.com.apple.timemachine.donotpresent
-
-# Directories potentially created on remote AFP share
-.AppleDB
-.AppleDesktop
-Network Trash Folder
-Temporary Items
-.apdisk
-
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-.hypothesis/
-.pytest_cache/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-.python-version
-
-# celery beat schedule file
-celerybeat-schedule
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
diff --git a/AVLFormer/.gitignore b/AVLFormer/.gitignore
deleted file mode 100644
index 5ff8ef4..0000000
--- a/AVLFormer/.gitignore
+++ /dev/null
@@ -1,22 +0,0 @@
-src/evalcap/cider
-src/evalcap/cider/*
-src/evalcap/coco_caption
-src/evalcap/coco_caption/*
-src/evalcap/coco-caption
-src/evalcap/coco-caption/*
-
-# ignore dataset
-datasets/meta
-datasets/meta/*
-datasets/metadata
-datasets/metadata/*
-datasets/videos
-datasets/videos/*
-datasets/audios
-datasets/audios/*
-datasets/frames
-datasets/frames/*
-datasets/frame_tsv
-datasets/frame_tsv/*
-datasets/mp3_audio
-datasets/mp3_audio/*
\ No newline at end of file
diff --git a/AVLFormer/captioning/bert-base-uncased/added_tokens.json b/AVLFormer/captioning/bert-base-uncased/added_tokens.json
deleted file mode 100755
index 9e26dfe..0000000
--- a/AVLFormer/captioning/bert-base-uncased/added_tokens.json
+++ /dev/null
@@ -1 +0,0 @@
-{}
\ No newline at end of file
diff --git a/AVLFormer/captioning/bert-base-uncased/config.json b/AVLFormer/captioning/bert-base-uncased/config.json
deleted file mode 100755
index 7927667..0000000
--- a/AVLFormer/captioning/bert-base-uncased/config.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-  "architectures": [
-    "BertForMaskedLM"
-  ],
-  "attention_probs_dropout_prob": 0.1,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.1,
-  "hidden_size": 768,
-  "initializer_range": 0.02,
-  "intermediate_size": 3072,
-  "max_position_embeddings": 512,
-  "num_attention_heads": 12,
-  "num_hidden_layers": 12,
-  "type_vocab_size": 2,
-  "vocab_size": 30522
-}
diff --git a/AVLFormer/captioning/bert-base-uncased/special_tokens_map.json b/AVLFormer/captioning/bert-base-uncased/special_tokens_map.json
deleted file mode 100755
index e7b0375..0000000
--- a/AVLFormer/captioning/bert-base-uncased/special_tokens_map.json
+++ /dev/null
@@ -1 +0,0 @@
-{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
\ No newline at end of file
diff --git a/AVLFormer/captioning/bert-base-uncased/vocab.txt b/AVLFormer/captioning/bert-base-uncased/vocab.txt
deleted file mode 100755
index fb14027..0000000
--- a/AVLFormer/captioning/bert-base-uncased/vocab.txt
+++ /dev/null
@@ -1,30522 +0,0 @@
-[PAD]
-[unused0]
[30,522 deleted lines elided: the standard bert-base-uncased WordPiece vocabulary ([PAD], [unused0]–[unused993], [UNK], [CLS], [SEP], [MASK], punctuation, multilingual characters, and English wordpieces); the dump is truncated mid-file in the source]
-hitting -##hn -gregory -furthermore -magazines -fighters -solutions -##ery -pointing -requested -peru -reed -chancellor -knights -mask -worker -eldest -flames -reduction -1860 -volunteers -##tis -reporting -##hl -wire -advisory -endemic -origins -settlers -pursue -knock -consumer -1876 -eu -compound -creatures -mansion -sentenced -ivan -deployed -guitars -frowned -involves -mechanism -kilometers -perspective -shops -maps -terminus -duncan -alien -fist -bridges -##pers -heroes -fed -derby -swallowed -##ros -patent -sara -illness -characterized -adventures -slide -hawaii -jurisdiction -##op -organised -##side -adelaide -walks -biology -se -##ties -rogers -swing -tightly -boundaries -##rie -prepare -implementation -stolen -##sha -certified -colombia -edwards -garage -##mm -recalled -##ball -rage -harm -nigeria -breast -##ren -furniture -pupils -settle -##lus -cuba -balls -client -alaska -21st -linear -thrust -celebration -latino -genetic -terror -##cia -##ening -lightning -fee -witness -lodge -establishing -skull -##ique -earning -hood -##ei -rebellion -wang -sporting -warned -missile -devoted -activist -porch -worship -fourteen -package -1871 -decorated -##shire -housed -##ock -chess -sailed -doctors -oscar -joan -treat -garcia -harbour -jeremy -##ire -traditions -dominant -jacques -##gon -##wan -relocated -1879 -amendment -sized -companion -simultaneously -volleyball -spun -acre -increases -stopping -loves -belongs -affect -drafted -tossed -scout -battles -1875 -filming -shoved -munich -tenure -vertical -romance -pc -##cher -argue -##ical -craft -ranging -www -opens -honest -tyler -yesterday -virtual -##let -muslims -reveal -snake -immigrants -radical -screaming -speakers -firing -saving -belonging -ease -lighting -prefecture -blame -farmer -hungry -grows -rubbed -beam -sur -subsidiary -##cha -armenian -sao -dropping -conventional -##fer -microsoft -reply -qualify -spots -1867 -sweat -festivals -##ken -immigration -physician -discover -exposure -sandy -explanation -isaac -implemented -##fish -hart -initiated -connect -stakes -presents -heights -householder -pleased -tourist -regardless -slip -closest -##ction -surely -sultan -brings -riley -preparation -aboard -slammed -baptist -experiment -ongoing -interstate -organic -playoffs -##ika -1877 -130 -##tar -hindu -error -tours -tier -plenty -arrangements -talks -trapped -excited -sank -ho -athens -1872 -denver -welfare -suburb -athletes -trick -diverse -belly -exclusively -yelled -1868 -##med -conversion -##ette -1874 -internationally -computers -conductor -abilities -sensitive -hello -dispute -measured -globe -rocket -prices -amsterdam -flights -tigers -inn -municipalities -emotion -references -3d -##mus -explains -airlines -manufactured -pm -archaeological -1873 -interpretation -devon -comment -##ites -settlements -kissing -absolute -improvement -suite -impressed -barcelona -sullivan -jefferson -towers -jesse -julie -##tin -##lu -grandson -hi -gauge -regard -rings -interviews -trace -raymond -thumb -departments -burns -serial -bulgarian -scores -demonstrated -##ix -1866 -kyle -alberta -underneath -romanized -##ward -relieved -acquisition -phrase -cliff -reveals -han -cuts -merger -custom -##dar -nee -gilbert -graduation -##nts -assessment -cafe -difficulty -demands -swung -democrat -jennifer -commons -1940s -grove -##yo -completing -focuses -sum -substitute -bearing -stretch -reception -##py -reflected -essentially -destination -pairs -##ched -survival -resource -##bach -promoting -doubles -messages -tear -##down -##fully -parade -florence 
-harvey -incumbent -partial -framework -900 -pedro -frozen -procedure -olivia -controls -##mic -shelter -personally -temperatures -##od -brisbane -tested -sits -marble -comprehensive -oxygen -leonard -##kov -inaugural -iranian -referring -quarters -attitude -##ivity -mainstream -lined -mars -dakota -norfolk -unsuccessful -##° -explosion -helicopter -congressional -##sing -inspector -bitch -seal -departed -divine -##ters -coaching -examination -punishment -manufacturer -sink -columns -unincorporated -signals -nevada -squeezed -dylan -dining -photos -martial -manuel -eighteen -elevator -brushed -plates -ministers -ivy -congregation -##len -slept -specialized -taxes -curve -restricted -negotiations -likes -statistical -arnold -inspiration -execution -bold -intermediate -significance -margin -ruler -wheels -gothic -intellectual -dependent -listened -eligible -buses -widow -syria -earn -cincinnati -collapsed -recipient -secrets -accessible -philippine -maritime -goddess -clerk -surrender -breaks -playoff -database -##ified -##lon -ideal -beetle -aspect -soap -regulation -strings -expand -anglo -shorter -crosses -retreat -tough -coins -wallace -directions -pressing -##oon -shipping -locomotives -comparison -topics -nephew -##mes -distinction -honors -travelled -sierra -ibn -##over -fortress -sa -recognised -carved -1869 -clients -##dan -intent -##mar -coaches -describing -bread -##ington -beaten -northwestern -##ona -merit -youtube -collapse -challenges -em -historians -objective -submitted -virus -attacking -drake -assume -##ere -diseases -marc -stem -leeds -##cus -##ab -farming -glasses -##lock -visits -nowhere -fellowship -relevant -carries -restaurants -experiments -101 -constantly -bases -targets -shah -tenth -opponents -verse -territorial -##ira -writings -corruption -##hs -instruction -inherited -reverse -emphasis -##vic -employee -arch -keeps -rabbi -watson -payment -uh -##ala -nancy -##tre -venice -fastest -sexy -banned -adrian -properly -ruth -touchdown -dollar -boards -metre -circles -edges -favour -comments -ok -travels -liberation -scattered -firmly -##ular -holland -permitted -diesel -kenya -den -originated -##ral -demons -resumed -dragged -rider -##rus -servant -blinked -extend -torn -##ias -##sey -input -meal -everybody -cylinder -kinds -camps -##fe -bullet -logic -##wn -croatian -evolved -healthy -fool -chocolate -wise -preserve -pradesh -##ess -respective -1850 -##ew -chicken -artificial -gross -corresponding -convicted -cage -caroline -dialogue -##dor -narrative -stranger -mario -br -christianity -failing -trent -commanding -buddhist -1848 -maurice -focusing -yale -bike -altitude -##ering -mouse -revised -##sley -veteran -##ig -pulls -theology -crashed -campaigns -legion -##ability -drag -excellence -customer -cancelled -intensity -excuse -##lar -liga -participating -contributing -printing -##burn -variable -##rk -curious -bin -legacy -renaissance -##my -symptoms -binding -vocalist -dancer -##nie -grammar -gospel -democrats -ya -enters -sc -diplomatic -hitler -##ser -clouds -mathematical -quit -defended -oriented -##heim -fundamental -hardware -impressive -equally -convince -confederate -guilt -chuck -sliding -##ware -magnetic -narrowed -petersburg -bulgaria -otto -phd -skill -##ama -reader -hopes -pitcher -reservoir -hearts -automatically -expecting -mysterious -bennett -extensively -imagined -seeds -monitor -fix -##ative -journalism -struggling -signature -ranch -encounter -photographer -observation -protests -##pin -influences -##hr -calendar -##all -cruz -croatia 
-locomotive -hughes -naturally -shakespeare -basement -hook -uncredited -faded -theories -approaches -dare -phillips -filling -fury -obama -##ain -efficient -arc -deliver -min -raid -breeding -inducted -leagues -efficiency -axis -montana -eagles -##ked -supplied -instructions -karen -picking -indicating -trap -anchor -practically -christians -tomb -vary -occasional -electronics -lords -readers -newcastle -faint -innovation -collect -situations -engagement -160 -claude -mixture -##feld -peer -tissue -logo -lean -##ration -°f -floors -##ven -architects -reducing -##our -##ments -rope -1859 -ottawa -##har -samples -banking -declaration -proteins -resignation -francois -saudi -advocate -exhibited -armor -twins -divorce -##ras -abraham -reviewed -jo -temporarily -matrix -physically -pulse -curled -##ena -difficulties -bengal -usage -##ban -annie -riders -certificate -##pi -holes -warsaw -distinctive -jessica -##mon -mutual -1857 -customs -circular -eugene -removal -loaded -mere -vulnerable -depicted -generations -dame -heir -enormous -lightly -climbing -pitched -lessons -pilots -nepal -ram -google -preparing -brad -louise -renowned -##₂ -liam -##ably -plaza -shaw -sophie -brilliant -bills -##bar -##nik -fucking -mainland -server -pleasant -seized -veterans -jerked -fail -beta -brush -radiation -stored -warmth -southeastern -nate -sin -raced -berkeley -joke -athlete -designation -trunk -##low -roland -qualification -archives -heels -artwork -receives -judicial -reserves -##bed -woke -installation -abu -floating -fake -lesser -excitement -interface -concentrated -addressed -characteristic -amanda -saxophone -monk -auto -##bus -releasing -egg -dies -interaction -defender -ce -outbreak -glory -loving -##bert -sequel -consciousness -http -awake -ski -enrolled -##ress -handling -rookie -brow -somebody -biography -warfare -amounts -contracts -presentation -fabric -dissolved -challenged -meter -psychological -lt -elevated -rally -accurate -##tha -hospitals -undergraduate -specialist -venezuela -exhibit -shed -nursing -protestant -fluid -structural -footage -jared -consistent -prey -##ska -succession -reflect -exile -lebanon -wiped -suspect -shanghai -resting -integration -preservation -marvel -variant -pirates -sheep -rounded -capita -sailing -colonies -manuscript -deemed -variations -clarke -functional -emerging -boxing -relaxed -curse -azerbaijan -heavyweight -nickname -editorial -rang -grid -tightened -earthquake -flashed -miguel -rushing -##ches -improvements -boxes -brooks -180 -consumption -molecular -felix -societies -repeatedly -variation -aids -civic -graphics -professionals -realm -autonomous -receiver -delayed -workshop -militia -chairs -trump -canyon -##point -harsh -extending -lovely -happiness -##jan -stake -eyebrows -embassy -wellington -hannah -##ella -sony -corners -bishops -swear -cloth -contents -xi -namely -commenced -1854 -stanford -nashville -courage -graphic -commitment -garrison -##bin -hamlet -clearing -rebels -attraction -literacy -cooking -ruins -temples -jenny -humanity -celebrate -hasn -freight -sixty -rebel -bastard -##art -newton -##ada -deer -##ges -##ching -smiles -delaware -singers -##ets -approaching -assists -flame -##ph -boulevard -barrel -planted -##ome -pursuit -##sia -consequences -posts -shallow -invitation -rode -depot -ernest -kane -rod -concepts -preston -topic -chambers -striking -blast -arrives -descendants -montgomery -ranges -worlds -##lay -##ari -span -chaos -praise -##ag -fewer -1855 -sanctuary -mud -fbi -##ions -programmes -maintaining -unity -harper 
-bore -handsome -closure -tournaments -thunder -nebraska -linda -facade -puts -satisfied -argentine -dale -cork -dome -panama -##yl -1858 -tasks -experts -##ates -feeding -equation -##las -##ida -##tu -engage -bryan -##ax -um -quartet -melody -disbanded -sheffield -blocked -gasped -delay -kisses -maggie -connects -##non -sts -poured -creator -publishers -##we -guided -ellis -extinct -hug -gaining -##ord -complicated -##bility -poll -clenched -investigate -##use -thereby -quantum -spine -cdp -humor -kills -administered -semifinals -##du -encountered -ignore -##bu -commentary -##maker -bother -roosevelt -140 -plains -halfway -flowing -cultures -crack -imprisoned -neighboring -airline -##ses -##view -##mate -##ec -gather -wolves -marathon -transformed -##ill -cruise -organisations -carol -punch -exhibitions -numbered -alarm -ratings -daddy -silently -##stein -queens -colours -impression -guidance -liu -tactical -##rat -marshal -della -arrow -##ings -rested -feared -tender -owns -bitter -advisor -escort -##ides -spare -farms -grants -##ene -dragons -encourage -colleagues -cameras -##und -sucked -pile -spirits -prague -statements -suspension -landmark -fence -torture -recreation -bags -permanently -survivors -pond -spy -predecessor -bombing -coup -##og -protecting -transformation -glow -##lands -##book -dug -priests -andrea -feat -barn -jumping -##chen -##ologist -##con -casualties -stern -auckland -pipe -serie -revealing -ba -##bel -trevor -mercy -spectrum -yang -consist -governing -collaborated -possessed -epic -comprises -blew -shane -##ack -lopez -honored -magical -sacrifice -judgment -perceived -hammer -mtv -baronet -tune -das -missionary -sheets -350 -neutral -oral -threatening -attractive -shade -aims -seminary -##master -estates -1856 -michel -wounds -refugees -manufacturers -##nic -mercury -syndrome -porter -##iya -##din -hamburg -identification -upstairs -purse -widened -pause -cared -breathed -affiliate -santiago -prevented -celtic -fisher -125 -recruited -byzantine -reconstruction -farther -##mp -diet -sake -au -spite -sensation -##ert -blank -separation -105 -##hon -vladimir -armies -anime -##lie -accommodate -orbit -cult -sofia -archive -##ify -##box -founders -sustained -disorder -honours -northeastern -mia -crops -violet -threats -blanket -fires -canton -followers -southwestern -prototype -voyage -assignment -altered -moderate -protocol -pistol -##eo -questioned -brass -lifting -1852 -math -authored -##ual -doug -dimensional -dynamic -##san -1851 -pronounced -grateful -quest -uncomfortable -boom -presidency -stevens -relating -politicians -chen -barrier -quinn -diana -mosque -tribal -cheese -palmer -portions -sometime -chester -treasure -wu -bend -download -millions -reforms -registration -##osa -consequently -monitoring -ate -preliminary -brandon -invented -ps -eaten -exterior -intervention -ports -documented -log -displays -lecture -sally -favourite -##itz -vermont -lo -invisible -isle -breed -##ator -journalists -relay -speaks -backward -explore -midfielder -actively -stefan -procedures -cannon -blond -kenneth -centered -servants -chains -libraries -malcolm -essex -henri -slavery -##hal -facts -fairy -coached -cassie -cats -washed -cop -##fi -announcement -item -2000s -vinyl -activated -marco -frontier -growled -curriculum -##das -loyal -accomplished -leslie -ritual -kenny -##00 -vii -napoleon -hollow -hybrid -jungle -stationed -friedrich -counted -##ulated -platinum -theatrical -seated -col -rubber -glen -1840 -diversity -healing -extends -id -provisions -administrator 
-columbus -##oe -tributary -te -assured -org -##uous -prestigious -examined -lectures -grammy -ronald -associations -bailey -allan -essays -flute -believing -consultant -proceedings -travelling -1853 -kit -kerala -yugoslavia -buddy -methodist -##ith -burial -centres -batman -##nda -discontinued -bo -dock -stockholm -lungs -severely -##nk -citing -manga -##ugh -steal -mumbai -iraqi -robot -celebrity -bride -broadcasts -abolished -pot -joel -overhead -franz -packed -reconnaissance -johann -acknowledged -introduce -handled -doctorate -developments -drinks -alley -palestine -##nis -##aki -proceeded -recover -bradley -grain -patch -afford -infection -nationalist -legendary -##ath -interchange -virtually -gen -gravity -exploration -amber -vital -wishes -powell -doctrine -elbow -screenplay -##bird -contribute -indonesian -pet -creates -##com -enzyme -kylie -discipline -drops -manila -hunger -##ien -layers -suffer -fever -bits -monica -keyboard -manages -##hood -searched -appeals -##bad -testament -grande -reid -##war -beliefs -congo -##ification -##dia -si -requiring -##via -casey -1849 -regret -streak -rape -depends -syrian -sprint -pound -tourists -upcoming -pub -##xi -tense -##els -practiced -echo -nationwide -guild -motorcycle -liz -##zar -chiefs -desired -elena -bye -precious -absorbed -relatives -booth -pianist -##mal -citizenship -exhausted -wilhelm -##ceae -##hed -noting -quarterback -urge -hectares -##gue -ace -holly -##tal -blonde -davies -parked -sustainable -stepping -twentieth -airfield -galaxy -nest -chip -##nell -tan -shaft -paulo -requirement -##zy -paradise -tobacco -trans -renewed -vietnamese -##cker -##ju -suggesting -catching -holmes -enjoying -md -trips -colt -holder -butterfly -nerve -reformed -cherry -bowling -trailer -carriage -goodbye -appreciate -toy -joshua -interactive -enabled -involve -##kan -collar -determination -bunch -facebook -recall -shorts -superintendent -episcopal -frustration -giovanni -nineteenth -laser -privately -array -circulation -##ovic -armstrong -deals -painful -permit -discrimination -##wi -aires -retiring -cottage -ni -##sta -horizon -ellen -jamaica -ripped -fernando -chapters -playstation -patron -lecturer -navigation -behaviour -genes -georgian -export -solomon -rivals -swift -seventeen -rodriguez -princeton -independently -sox -1847 -arguing -entity -casting -hank -criteria -oakland -geographic -milwaukee -reflection -expanding -conquest -dubbed -##tv -halt -brave -brunswick -doi -arched -curtis -divorced -predominantly -somerset -streams -ugly -zoo -horrible -curved -buenos -fierce -dictionary -vector -theological -unions -handful -stability -chan -punjab -segments -##lly -altar -ignoring -gesture -monsters -pastor -##stone -thighs -unexpected -operators -abruptly -coin -compiled -associates -improving -migration -pin -##ose -compact -collegiate -reserved -##urs -quarterfinals -roster -restore -assembled -hurry -oval -##cies -1846 -flags -martha -##del -victories -sharply -##rated -argues -deadly -neo -drawings -symbols -performer -##iel -griffin -restrictions -editing -andrews -java -journals -arabia -compositions -dee -pierce -removing -hindi -casino -runway -civilians -minds -nasa -hotels -##zation -refuge -rent -retain -potentially -conferences -suburban -conducting -##tto -##tions -##tle -descended -massacre -##cal -ammunition -terrain -fork -souls -counts -chelsea -durham -drives -cab -##bank -perth -realizing -palestinian -finn -simpson -##dal -betty -##ule -moreover -particles -cardinals -tent -evaluation -extraordinary -##oid 
-inscription -##works -wednesday -chloe -maintains -panels -ashley -trucks -##nation -cluster -sunlight -strikes -zhang -##wing -dialect -canon -##ap -tucked -##ws -collecting -##mas -##can -##sville -maker -quoted -evan -franco -aria -buying -cleaning -eva -closet -provision -apollo -clinic -rat -##ez -necessarily -ac -##gle -##ising -venues -flipped -cent -spreading -trustees -checking -authorized -##sco -disappointed -##ado -notion -duration -trumpet -hesitated -topped -brussels -rolls -theoretical -hint -define -aggressive -repeat -wash -peaceful -optical -width -allegedly -mcdonald -strict -copyright -##illa -investors -mar -jam -witnesses -sounding -miranda -michelle -privacy -hugo -harmony -##pp -valid -lynn -glared -nina -102 -headquartered -diving -boarding -gibson -##ncy -albanian -marsh -routine -dealt -enhanced -er -intelligent -substance -targeted -enlisted -discovers -spinning -observations -pissed -smoking -rebecca -capitol -visa -varied -costume -seemingly -indies -compensation -surgeon -thursday -arsenal -westminster -suburbs -rid -anglican -##ridge -knots -foods -alumni -lighter -fraser -whoever -portal -scandal -##ray -gavin -advised -instructor -flooding -terrorist -##ale -teenage -interim -senses -duck -teen -thesis -abby -eager -overcome -##ile -newport -glenn -rises -shame -##cc -prompted -priority -forgot -bomber -nicolas -protective -360 -cartoon -katherine -breeze -lonely -trusted -henderson -richardson -relax -banner -candy -palms -remarkable -##rio -legends -cricketer -essay -ordained -edmund -rifles -trigger -##uri -##away -sail -alert -1830 -audiences -penn -sussex -siblings -pursued -indianapolis -resist -rosa -consequence -succeed -avoided -1845 -##ulation -inland -##tie -##nna -counsel -profession -chronicle -hurried -##una -eyebrow -eventual -bleeding -innovative -cure -##dom -committees -accounting -con -scope -hardy -heather -tenor -gut -herald -codes -tore -scales -wagon -##oo -luxury -tin -prefer -fountain -triangle -bonds -darling -convoy -dried -traced -beings -troy -accidentally -slam -findings -smelled -joey -lawyers -outcome -steep -bosnia -configuration -shifting -toll -brook -performers -lobby -philosophical -construct -shrine -aggregate -boot -cox -phenomenon -savage -insane -solely -reynolds -lifestyle -##ima -nationally -holdings -consideration -enable -edgar -mo -mama -##tein -fights -relegation -chances -atomic -hub -conjunction -awkward -reactions -currency -finale -kumar -underwent -steering -elaborate -gifts -comprising -melissa -veins -reasonable -sunshine -chi -solve -trails -inhabited -elimination -ethics -huh -ana -molly -consent -apartments -layout -marines -##ces -hunters -bulk -##oma -hometown -##wall -##mont -cracked -reads -neighbouring -withdrawn -admission -wingspan -damned -anthology -lancashire -brands -batting -forgive -cuban -awful -##lyn -104 -dimensions -imagination -##ade -dante -##ship -tracking -desperately -goalkeeper -##yne -groaned -workshops -confident -burton -gerald -milton -circus -uncertain -slope -copenhagen -sophia -fog -philosopher -portraits -accent -cycling -varying -gripped -larvae -garrett -specified -scotia -mature -luther -kurt -rap -##kes -aerial -750 -ferdinand -heated -es -transported -##shan -safely -nonetheless -##orn -##gal -motors -demanding -##sburg -startled -##brook -ally -generate -caps -ghana -stained -demo -mentions -beds -ap -afterward -diary -##bling -utility -##iro -richards -1837 -conspiracy -conscious -shining -footsteps -observer -cyprus -urged -loyalty -developer -probability 
-olive -upgraded -gym -miracle -insects -graves -1844 -ourselves -hydrogen -amazon -katie -tickets -poets -##pm -planes -##pan -prevention -witnessed -dense -jin -randy -tang -warehouse -monroe -bang -archived -elderly -investigations -alec -granite -mineral -conflicts -controlling -aboriginal -carlo -##zu -mechanics -stan -stark -rhode -skirt -est -##berry -bombs -respected -##horn -imposed -limestone -deny -nominee -memphis -grabbing -disabled -##als -amusement -aa -frankfurt -corn -referendum -varies -slowed -disk -firms -unconscious -incredible -clue -sue -##zhou -twist -##cio -joins -idaho -chad -developers -computing -destroyer -103 -mortal -tucker -kingston -choices -yu -carson -1800 -os -whitney -geneva -pretend -dimension -staged -plateau -maya -##une -freestyle -##bc -rovers -hiv -##ids -tristan -classroom -prospect -##hus -honestly -diploma -lied -thermal -auxiliary -feast -unlikely -iata -##tel -morocco -pounding -treasury -lithuania -considerably -1841 -dish -1812 -geological -matching -stumbled -destroying -marched -brien -advances -cake -nicole -belle -settling -measuring -directing -##mie -tuesday -bassist -capabilities -stunned -fraud -torpedo -##list -##phone -anton -wisdom -surveillance -ruined -##ulate -lawsuit -healthcare -theorem -halls -trend -aka -horizontal -dozens -acquire -lasting -swim -hawk -gorgeous -fees -vicinity -decrease -adoption -tactics -##ography -pakistani -##ole -draws -##hall -willie -burke -heath -algorithm -integral -powder -elliott -brigadier -jackie -tate -varieties -darker -##cho -lately -cigarette -specimens -adds -##ree -##ensis -##inger -exploded -finalist -cia -murders -wilderness -arguments -nicknamed -acceptance -onwards -manufacture -robertson -jets -tampa -enterprises -blog -loudly -composers -nominations -1838 -ai -malta -inquiry -automobile -hosting -viii -rays -tilted -grief -museums -strategies -furious -euro -equality -cohen -poison -surrey -wireless -governed -ridiculous -moses -##esh -##room -vanished -##ito -barnes -attract -morrison -istanbul -##iness -absent -rotation -petition -janet -##logical -satisfaction -custody -deliberately -observatory -comedian -surfaces -pinyin -novelist -strictly -canterbury -oslo -monks -embrace -ibm -jealous -photograph -continent -dorothy -marina -doc -excess -holden -allegations -explaining -stack -avoiding -lance -storyline -majesty -poorly -spike -dos -bradford -raven -travis -classics -proven -voltage -pillow -fists -butt -1842 -interpreted -##car -1839 -gage -telegraph -lens -promising -expelled -casual -collector -zones -##min -silly -nintendo -##kh -##bra -downstairs -chef -suspicious -afl -flies -vacant -uganda -pregnancy -condemned -lutheran -estimates -cheap -decree -saxon -proximity -stripped -idiot -deposits -contrary -presenter -magnus -glacier -im -offense -edwin -##ori -upright -##long -bolt -##ois -toss -geographical -##izes -environments -delicate -marking -abstract -xavier -nails -windsor -plantation -occurring -equity -saskatchewan -fears -drifted -sequences -vegetation -revolt -##stic -1843 -sooner -fusion -opposing -nato -skating -1836 -secretly -ruin -lease -##oc -edit -##nne -flora -anxiety -ruby -##ological -##mia -tel -bout -taxi -emmy -frost -rainbow -compounds -foundations -rainfall -assassination -nightmare -dominican -##win -achievements -deserve -orlando -intact -armenia -##nte -calgary -valentine -106 -marion -proclaimed -theodore -bells -courtyard -thigh -gonzalez -console -troop -minimal -monte -everyday -##ence -##if -supporter -terrorism -buck -openly 
-presbyterian -activists -carpet -##iers -rubbing -uprising -##yi -cute -conceived -legally -##cht -millennium -cello -velocity -ji -rescued -cardiff -1835 -rex -concentrate -senators -beard -rendered -glowing -battalions -scouts -competitors -sculptor -catalogue -arctic -ion -raja -bicycle -wow -glancing -lawn -##woman -gentleman -lighthouse -publish -predicted -calculated -##val -variants -##gne -strain -##ui -winston -deceased -##nus -touchdowns -brady -caleb -sinking -echoed -crush -hon -blessed -protagonist -hayes -endangered -magnitude -editors -##tine -estimate -responsibilities -##mel -backup -laying -consumed -sealed -zurich -lovers -frustrated -##eau -ahmed -kicking -mit -treasurer -1832 -biblical -refuse -terrified -pump -agrees -genuine -imprisonment -refuses -plymouth -##hen -lou -##nen -tara -trembling -antarctic -ton -learns -##tas -crap -crucial -faction -atop -##borough -wrap -lancaster -odds -hopkins -erik -lyon -##eon -bros -##ode -snap -locality -tips -empress -crowned -cal -acclaimed -chuckled -##ory -clara -sends -mild -towel -##fl -##day -##а -wishing -assuming -interviewed -##bal -##die -interactions -eden -cups -helena -##lf -indie -beck -##fire -batteries -filipino -wizard -parted -##lam -traces -##born -rows -idol -albany -delegates -##ees -##sar -discussions -##ex -notre -instructed -belgrade -highways -suggestion -lauren -possess -orientation -alexandria -abdul -beats -salary -reunion -ludwig -alright -wagner -intimate -pockets -slovenia -hugged -brighton -merchants -cruel -stole -trek -slopes -repairs -enrollment -politically -underlying -promotional -counting -boeing -##bb -isabella -naming -##и -keen -bacteria -listing -separately -belfast -ussr -450 -lithuanian -anybody -ribs -sphere -martinez -cock -embarrassed -proposals -fragments -nationals -##fs -##wski -premises -fin -1500 -alpine -matched -freely -bounded -jace -sleeve -##af -gaming -pier -populated -evident -##like -frances -flooded -##dle -frightened -pour -trainer -framed -visitor -challenging -pig -wickets -##fold -infected -email -##pes -arose -##aw -reward -ecuador -oblast -vale -ch -shuttle -##usa -bach -rankings -forbidden -cornwall -accordance -salem -consumers -bruno -fantastic -toes -machinery -resolved -julius -remembering -propaganda -iceland -bombardment -tide -contacts -wives -##rah -concerto -macdonald -albania -implement -daisy -tapped -sudan -helmet -angela -mistress -##lic -crop -sunk -finest -##craft -hostile -##ute -##tsu -boxer -fr -paths -adjusted -habit -ballot -supervision -soprano -##zen -bullets -wicked -sunset -regiments -disappear -lamp -performs -app -##gia -##oa -rabbit -digging -incidents -entries -##cion -dishes -##oi -introducing -##ati -##fied -freshman -slot -jill -tackles -baroque -backs -##iest -lone -sponsor -destiny -altogether -convert -##aro -consensus -shapes -demonstration -basically -feminist -auction -artifacts -##bing -strongest -twitter -halifax -2019 -allmusic -mighty -smallest -precise -alexandra -viola -##los -##ille -manuscripts -##illo -dancers -ari -managers -monuments -blades -barracks -springfield -maiden -consolidated -electron -##end -berry -airing -wheat -nobel -inclusion -blair -payments -geography -bee -cc -eleanor -react -##hurst -afc -manitoba -##yu -su -lineup -fitness -recreational -investments -airborne -disappointment -##dis -edmonton -viewing -##row -renovation -##cast -infant -bankruptcy -roses -aftermath -pavilion -##yer -carpenter -withdrawal -ladder -##hy -discussing -popped -reliable -agreements -rochester -##abad -curves 
-bombers -220 -rao -reverend -decreased -choosing -107 -stiff -consulting -naples -crawford -tracy -ka -ribbon -cops -##lee -crushed -deciding -unified -teenager -accepting -flagship -explorer -poles -sanchez -inspection -revived -skilled -induced -exchanged -flee -locals -tragedy -swallow -loading -hanna -demonstrate -##ela -salvador -flown -contestants -civilization -##ines -wanna -rhodes -fletcher -hector -knocking -considers -##ough -nash -mechanisms -sensed -mentally -walt -unclear -##eus -renovated -madame -##cks -crews -governmental -##hin -undertaken -monkey -##ben -##ato -fatal -armored -copa -caves -governance -grasp -perception -certification -froze -damp -tugged -wyoming -##rg -##ero -newman -##lor -nerves -curiosity -graph -115 -##ami -withdraw -tunnels -dull -meredith -moss -exhibits -neighbors -communicate -accuracy -explored -raiders -republicans -secular -kat -superman -penny -criticised -##tch -freed -update -conviction -wade -ham -likewise -delegation -gotta -doll -promises -technological -myth -nationality -resolve -convent -##mark -sharon -dig -sip -coordinator -entrepreneur -fold -##dine -capability -councillor -synonym -blown -swan -cursed -1815 -jonas -haired -sofa -canvas -keeper -rivalry -##hart -rapper -speedway -swords -postal -maxwell -estonia -potter -recurring -##nn -##ave -errors -##oni -cognitive -1834 -##² -claws -nadu -roberto -bce -wrestler -ellie -##ations -infinite -ink -##tia -presumably -finite -staircase -108 -noel -patricia -nacional -##cation -chill -eternal -tu -preventing -prussia -fossil -limbs -##logist -ernst -frog -perez -rene -##ace -pizza -prussian -##ios -##vy -molecules -regulatory -answering -opinions -sworn -lengths -supposedly -hypothesis -upward -habitats -seating -ancestors -drank -yield -hd -synthesis -researcher -modest -##var -mothers -peered -voluntary -homeland -##the -acclaim -##igan -static -valve -luxembourg -alto -carroll -fe -receptor -norton -ambulance -##tian -johnston -catholics -depicting -jointly -elephant -gloria -mentor -badge -ahmad -distinguish -remarked -councils -precisely -allison -advancing -detection -crowded -##10 -cooperative -ankle -mercedes -dagger -surrendered -pollution -commit -subway -jeffrey -lesson -sculptures -provider -##fication -membrane -timothy -rectangular -fiscal -heating -teammate -basket -particle -anonymous -deployment -##ple -missiles -courthouse -proportion -shoe -sec -##ller -complaints -forbes -blacks -abandon -remind -sizes -overwhelming -autobiography -natalie -##awa -risks -contestant -countryside -babies -scorer -invaded -enclosed -proceed -hurling -disorders -##cu -reflecting -continuously -cruiser -graduates -freeway -investigated -ore -deserved -maid -blocking -phillip -jorge -shakes -dove -mann -variables -lacked -burden -accompanying -que -consistently -organizing -provisional -complained -endless -##rm -tubes -juice -georges -krishna -mick -labels -thriller -##uch -laps -arcade -sage -snail -##table -shannon -fi -laurence -seoul -vacation -presenting -hire -churchill -surprisingly -prohibited -savannah -technically -##oli -170 -##lessly -testimony -suited -speeds -toys -romans -mlb -flowering -measurement -talented -kay -settings -charleston -expectations -shattered -achieving -triumph -ceremonies -portsmouth -lanes -mandatory -loser -stretching -cologne -realizes -seventy -cornell -careers -webb -##ulating -americas -budapest -ava -suspicion -##ison -yo -conrad -##hai -sterling -jessie -rector -##az -1831 -transform -organize -loans -christine -volcanic -warrant -slender 
-summers -subfamily -newer -danced -dynamics -rhine -proceeds -heinrich -gastropod -commands -sings -facilitate -easter -ra -positioned -responses -expense -fruits -yanked -imported -25th -velvet -vic -primitive -tribune -baldwin -neighbourhood -donna -rip -hay -pr -##uro -1814 -espn -welcomed -##aria -qualifier -glare -highland -timing -##cted -shells -eased -geometry -louder -exciting -slovakia -##sion -##iz -##lot -savings -prairie -##ques -marching -rafael -tonnes -##lled -curtain -preceding -shy -heal -greene -worthy -##pot -detachment -bury -sherman -##eck -reinforced -seeks -bottles -contracted -duchess -outfit -walsh -##sc -mickey -##ase -geoffrey -archer -squeeze -dawson -eliminate -invention -##enberg -neal -##eth -stance -dealer -coral -maple -retire -polo -simplified -##ht -1833 -hid -watts -backwards -jules -##oke -genesis -mt -frames -rebounds -burma -woodland -moist -santos -whispers -drained -subspecies -##aa -streaming -ulster -burnt -correspondence -maternal -gerard -denis -stealing -##load -genius -duchy -##oria -inaugurated -momentum -suits -placement -sovereign -clause -thames -##hara -confederation -reservation -sketch -yankees -lets -rotten -charm -hal -verses -ultra -commercially -dot -salon -citation -adopt -winnipeg -mist -allocated -cairo -##boy -jenkins -interference -objectives -##wind -1820 -portfolio -armoured -sectors -##eh -initiatives -##world -integrity -exercises -robe -tap -ab -gazed -##tones -distracted -rulers -111 -favorable -jerome -tended -cart -factories -##eri -diplomat -valued -gravel -charitable -##try -calvin -exploring -chang -shepherd -terrace -pdf -pupil -##ural -reflects -ups -##rch -governors -shelf -depths -##nberg -trailed -crest -tackle -##nian -##ats -hatred -##kai -clare -makers -ethiopia -longtime -detected -embedded -lacking -slapped -rely -thomson -anticipation -iso -morton -successive -agnes -screenwriter -straightened -philippe -playwright -haunted -licence -iris -intentions -sutton -112 -logical -correctly -##weight -branded -licked -tipped -silva -ricky -narrator -requests -##ents -greeted -supernatural -cow -##wald -lung -refusing -employer -strait -gaelic -liner -##piece -zoe -sabha -##mba -driveway -harvest -prints -bates -reluctantly -threshold -algebra -ira -wherever -coupled -240 -assumption -picks -##air -designers -raids -gentlemen -##ean -roller -blowing -leipzig -locks -screw -dressing -strand -##lings -scar -dwarf -depicts -##nu -nods -##mine -differ -boris -##eur -yuan -flip -##gie -mob -invested -questioning -applying -##ture -shout -##sel -gameplay -blamed -illustrations -bothered -weakness -rehabilitation -##of -##zes -envelope -rumors -miners -leicester -subtle -kerry -##ico -ferguson -##fu -premiership -ne -##cat -bengali -prof -catches -remnants -dana -##rily -shouting -presidents -baltic -ought -ghosts -dances -sailors -shirley -fancy -dominic -##bie -madonna -##rick -bark -buttons -gymnasium -ashes -liver -toby -oath -providence -doyle -evangelical -nixon -cement -carnegie -embarked -hatch -surroundings -guarantee -needing -pirate -essence -##bee -filter -crane -hammond -projected -immune -percy -twelfth -##ult -regent -doctoral -damon -mikhail -##ichi -lu -critically -elect -realised -abortion -acute -screening -mythology -steadily -##fc -frown -nottingham -kirk -wa -minneapolis -##rra -module -algeria -mc -nautical -encounters -surprising -statues -availability -shirts -pie -alma -brows -munster -mack -soup -crater -tornado -sanskrit -cedar -explosive -bordered -dixon -planets -stamp -exam -happily 
-##bble -carriers -kidnapped -##vis -accommodation -emigrated -##met -knockout -correspondent -violation -profits -peaks -lang -specimen -agenda -ancestry -pottery -spelling -equations -obtaining -ki -linking -1825 -debris -asylum -##20 -buddhism -teddy -##ants -gazette -##nger -##sse -dental -eligibility -utc -fathers -averaged -zimbabwe -francesco -coloured -hissed -translator -lynch -mandate -humanities -mackenzie -uniforms -lin -##iana -##gio -asset -mhz -fitting -samantha -genera -wei -rim -beloved -shark -riot -entities -expressions -indo -carmen -slipping -owing -abbot -neighbor -sidney -##av -rats -recommendations -encouraging -squadrons -anticipated -commanders -conquered -##oto -donations -diagnosed -##mond -divide -##iva -guessed -decoration -vernon -auditorium -revelation -conversations -##kers -##power -herzegovina -dash -alike -protested -lateral -herman -accredited -mg -##gent -freeman -mel -fiji -crow -crimson -##rine -livestock -##pped -humanitarian -bored -oz -whip -##lene -##ali -legitimate -alter -grinning -spelled -anxious -oriental -wesley -##nin -##hole -carnival -controller -detect -##ssa -bowed -educator -kosovo -macedonia -##sin -occupy -mastering -stephanie -janeiro -para -unaware -nurses -noon -135 -cam -hopefully -ranger -combine -sociology -polar -rica -##eer -neill -##sman -holocaust -##ip -doubled -lust -1828 -109 -decent -cooling -unveiled -##card -1829 -nsw -homer -chapman -meyer -##gin -dive -mae -reagan -expertise -##gled -darwin -brooke -sided -prosecution -investigating -comprised -petroleum -genres -reluctant -differently -trilogy -johns -vegetables -corpse -highlighted -lounge -pension -unsuccessfully -elegant -aided -ivory -beatles -amelia -cain -dubai -sunny -immigrant -babe -click -##nder -underwater -pepper -combining -mumbled -atlas -horns -accessed -ballad -physicians -homeless -gestured -rpm -freak -louisville -corporations -patriots -prizes -rational -warn -modes -decorative -overnight -din -troubled -phantom -##ort -monarch -sheer -##dorf -generals -guidelines -organs -addresses -##zon -enhance -curling -parishes -cord -##kie -linux -caesar -deutsche -bavaria -##bia -coleman -cyclone -##eria -bacon -petty -##yama -##old -hampton -diagnosis -1824 -throws -complexity -rita -disputed -##₃ -pablo -##sch -marketed -trafficking -##ulus -examine -plague -formats -##oh -vault -faithful -##bourne -webster -##ox -highlights -##ient -##ann -phones -vacuum -sandwich -modeling -##gated -bolivia -clergy -qualities -isabel -##nas -##ars -wears -screams -reunited -annoyed -bra -##ancy -##rate -differential -transmitter -tattoo -container -poker -##och -excessive -resides -cowboys -##tum -augustus -trash -providers -statute -retreated -balcony -reversed -void -storey -preceded -masses -leap -laughs -neighborhoods -wards -schemes -falcon -santo -battlefield -pad -ronnie -thread -lesbian -venus -##dian -beg -sandstone -daylight -punched -gwen -analog -stroked -wwe -acceptable -measurements -dec -toxic -##kel -adequate -surgical -economist -parameters -varsity -##sberg -quantity -ella -##chy -##rton -countess -generating -precision -diamonds -expressway -ga -##ı -1821 -uruguay -talents -galleries -expenses -scanned -colleague -outlets -ryder -lucien -##ila -paramount -##bon -syracuse -dim -fangs -gown -sweep -##sie -toyota -missionaries -websites -##nsis -sentences -adviser -val -trademark -spells -##plane -patience -starter -slim -##borg -toe -incredibly -shoots -elliot -nobility -##wyn -cowboy -endorsed -gardner -tendency -persuaded -organisms -emissions 
-kazakhstan -amused -boring -chips -themed -##hand -llc -constantinople -chasing -systematic -guatemala -borrowed -erin -carey -##hard -highlands -struggles -1810 -##ifying -##ced -wong -exceptions -develops -enlarged -kindergarten -castro -##ern -##rina -leigh -zombie -juvenile -##most -consul -##nar -sailor -hyde -clarence -intensive -pinned -nasty -useless -jung -clayton -stuffed -exceptional -ix -apostolic -230 -transactions -##dge -exempt -swinging -cove -religions -##ash -shields -dairy -bypass -190 -pursuing -bug -joyce -bombay -chassis -southampton -chat -interact -redesignated -##pen -nascar -pray -salmon -rigid -regained -malaysian -grim -publicity -constituted -capturing -toilet -delegate -purely -tray -drift -loosely -striker -weakened -trinidad -mitch -itv -defines -transmitted -ming -scarlet -nodding -fitzgerald -fu -narrowly -sp -tooth -standings -virtue -##₁ -##wara -##cting -chateau -gloves -lid -##nel -hurting -conservatory -##pel -sinclair -reopened -sympathy -nigerian -strode -advocated -optional -chronic -discharge -##rc -suck -compatible -laurel -stella -shi -fails -wage -dodge -128 -informal -sorts -levi -buddha -villagers -##aka -chronicles -heavier -summoned -gateway -3000 -eleventh -jewelry -translations -accordingly -seas -##ency -fiber -pyramid -cubic -dragging -##ista -caring -##ops -android -contacted -lunar -##dt -kai -lisbon -patted -1826 -sacramento -theft -madagascar -subtropical -disputes -ta -holidays -piper -willow -mare -cane -itunes -newfoundland -benny -companions -dong -raj -observe -roar -charming -plaque -tibetan -fossils -enacted -manning -bubble -tina -tanzania -##eda -##hir -funk -swamp -deputies -cloak -ufc -scenario -par -scratch -metals -anthem -guru -engaging -specially -##boat -dialects -nineteen -cecil -duet -disability -messenger -unofficial -##lies -defunct -eds -moonlight -drainage -surname -puzzle -honda -switching -conservatives -mammals -knox -broadcaster -sidewalk -cope -##ried -benson -princes -peterson -##sal -bedford -sharks -eli -wreck -alberto -gasp -archaeology -lgbt -teaches -securities -madness -compromise -waving -coordination -davidson -visions -leased -possibilities -eighty -jun -fernandez -enthusiasm -assassin -sponsorship -reviewer -kingdoms -estonian -laboratories -##fy -##nal -applies -verb -celebrations -##zzo -rowing -lightweight -sadness -submit -mvp -balanced -dude -##vas -explicitly -metric -magnificent -mound -brett -mohammad -mistakes -irregular -##hing -##ass -sanders -betrayed -shipped -surge -##enburg -reporters -termed -georg -pity -verbal -bulls -abbreviated -enabling -appealed -##are -##atic -sicily -sting -heel -sweetheart -bart -spacecraft -brutal -monarchy -##tter -aberdeen -cameo -diane -##ub -survivor -clyde -##aries -complaint -##makers -clarinet -delicious -chilean -karnataka -coordinates -1818 -panties -##rst -pretending -ar -dramatically -kiev -bella -tends -distances -113 -catalog -launching -instances -telecommunications -portable -lindsay -vatican -##eim -angles -aliens -marker -stint -screens -bolton -##rne -judy -wool -benedict -plasma -europa -spark -imaging -filmmaker -swiftly -##een -contributor -##nor -opted -stamps -apologize -financing -butter -gideon -sophisticated -alignment -avery -chemicals -yearly -speculation -prominence -professionally -##ils -immortal -institutional -inception -wrists -identifying -tribunal -derives -gains -##wo -papal -preference -linguistic -vince -operative -brewery -##ont -unemployment -boyd -##ured -##outs -albeit -prophet -1813 -bi -##rr -##face -##rad 
-quarterly -asteroid -cleaned -radius -temper -##llen -telugu -jerk -viscount -menu -##ote -glimpse -##aya -yacht -hawaiian -baden -##rl -laptop -readily -##gu -monetary -offshore -scots -watches -##yang -##arian -upgrade -needle -xbox -lea -encyclopedia -flank -fingertips -##pus -delight -teachings -confirm -roth -beaches -midway -winters -##iah -teasing -daytime -beverly -gambling -bonnie -##backs -regulated -clement -hermann -tricks -knot -##shing -##uring -##vre -detached -ecological -owed -specialty -byron -inventor -bats -stays -screened -unesco -midland -trim -affection -##ander -##rry -jess -thoroughly -feedback -##uma -chennai -strained -heartbeat -wrapping -overtime -pleaded -##sworth -mon -leisure -oclc -##tate -##ele -feathers -angelo -thirds -nuts -surveys -clever -gill -commentator -##dos -darren -rides -gibraltar -##nc -##mu -dissolution -dedication -shin -meals -saddle -elvis -reds -chaired -taller -appreciation -functioning -niece -favored -advocacy -robbie -criminals -suffolk -yugoslav -passport -constable -congressman -hastings -vera -##rov -consecrated -sparks -ecclesiastical -confined -##ovich -muller -floyd -nora -1822 -paved -1827 -cumberland -ned -saga -spiral -##flow -appreciated -yi -collaborative -treating -similarities -feminine -finishes -##ib -jade -import -##nse -##hot -champagne -mice -securing -celebrities -helsinki -attributes -##gos -cousins -phases -ache -lucia -gandhi -submission -vicar -spear -shine -tasmania -biting -detention -constitute -tighter -seasonal -##gus -terrestrial -matthews -##oka -effectiveness -parody -philharmonic -##onic -1816 -strangers -encoded -consortium -guaranteed -regards -shifts -tortured -collision -supervisor -inform -broader -insight -theaters -armour -emeritus -blink -incorporates -mapping -##50 -##ein -handball -flexible -##nta -substantially -generous -thief -##own -carr -loses -1793 -prose -ucla -romeo -generic -metallic -realization -damages -mk -commissioners -zach -default -##ther -helicopters -lengthy -stems -spa -partnered -spectators -rogue -indication -penalties -teresa -1801 -sen -##tric -dalton -##wich -irving -photographic -##vey -dell -deaf -peters -excluded -unsure -##vable -patterson -crawled -##zio -resided -whipped -latvia -slower -ecole -pipes -employers -maharashtra -comparable -va -textile -pageant -##gel -alphabet -binary -irrigation -chartered -choked -antoine -offs -waking -supplement -##wen -quantities -demolition -regain -locate -urdu -folks -alt -114 -##mc -scary -andreas -whites -##ava -classrooms -mw -aesthetic -publishes -valleys -guides -cubs -johannes -bryant -conventions -affecting -##itt -drain -awesome -isolation -prosecutor -ambitious -apology -captive -downs -atmospheric -lorenzo -aisle -beef -foul -##onia -kidding -composite -disturbed -illusion -natives -##ffer -emi -rockets -riverside -wartime -painters -adolf -melted -##ail -uncertainty -simulation -hawks -progressed -meantime -builder -spray -breach -unhappy -regina -russians -##urg -determining -##tation -tram -1806 -##quin -aging -##12 -1823 -garion -rented -mister -diaz -terminated -clip -1817 -depend -nervously -disco -owe -defenders -shiva -notorious -disbelief -shiny -worcester -##gation -##yr -trailing -undertook -islander -belarus -limitations -watershed -fuller -overlooking -utilized -raphael -1819 -synthetic -breakdown -klein -##nate -moaned -memoir -lamb -practicing -##erly -cellular -arrows -exotic -##graphy -witches -117 -charted -rey -hut -hierarchy -subdivision -freshwater -giuseppe -aloud -reyes -qatar -marty 
-sideways -utterly -sexually -jude -prayers -mccarthy -softball -blend -damien -##gging -##metric -wholly -erupted -lebanese -negro -revenues -tasted -comparative -teamed -transaction -labeled -maori -sovereignty -parkway -trauma -gran -malay -121 -advancement -descendant -2020 -buzz -salvation -inventory -symbolic -##making -antarctica -mps -##gas -##bro -mohammed -myanmar -holt -submarines -tones -##lman -locker -patriarch -bangkok -emerson -remarks -predators -kin -afghan -confession -norwich -rental -emerge -advantages -##zel -rca -##hold -shortened -storms -aidan -##matic -autonomy -compliance -##quet -dudley -atp -##osis -1803 -motto -documentation -summary -professors -spectacular -christina -archdiocese -flashing -innocence -remake -##dell -psychic -reef -scare -employ -rs -sticks -meg -gus -leans -##ude -accompany -bergen -tomas -##iko -doom -wages -pools -##nch -##bes -breasts -scholarly -alison -outline -brittany -breakthrough -willis -realistic -##cut -##boro -competitor -##stan -pike -picnic -icon -designing -commercials -washing -villain -skiing -micro -costumes -auburn -halted -executives -##hat -logistics -cycles -vowel -applicable -barrett -exclaimed -eurovision -eternity -ramon -##umi -##lls -modifications -sweeping -disgust -##uck -torch -aviv -ensuring -rude -dusty -sonic -donovan -outskirts -cu -pathway -##band -##gun -##lines -disciplines -acids -cadet -paired -##40 -sketches -##sive -marriages -##⁺ -folding -peers -slovak -implies -admired -##beck -1880s -leopold -instinct -attained -weston -megan -horace -##ination -dorsal -ingredients -evolutionary -##its -complications -deity -lethal -brushing -levy -deserted -institutes -posthumously -delivering -telescope -coronation -motivated -rapids -luc -flicked -pays -volcano -tanner -weighed -##nica -crowds -frankie -gifted -addressing -granddaughter -winding -##rna -constantine -gomez -##front -landscapes -rudolf -anthropology -slate -werewolf -##lio -astronomy -circa -rouge -dreaming -sack -knelt -drowned -naomi -prolific -tracked -freezing -herb -##dium -agony -randall -twisting -wendy -deposit -touches -vein -wheeler -##bbled -##bor -batted -retaining -tire -presently -compare -specification -daemon -nigel -##grave -merry -recommendation -czechoslovakia -sandra -ng -roma -##sts -lambert -inheritance -sheikh -winchester -cries -examining -##yle -comeback -cuisine -nave -##iv -ko -retrieve -tomatoes -barker -polished -defining -irene -lantern -personalities -begging -tract -swore -1809 -175 -##gic -omaha -brotherhood -##rley -haiti -##ots -exeter -##ete -##zia -steele -dumb -pearson -210 -surveyed -elisabeth -trends -##ef -fritz -##rf -premium -bugs -fraction -calmly -viking -##birds -tug -inserted -unusually -##ield -confronted -distress -crashing -brent -turks -resign -##olo -cambodia -gabe -sauce -##kal -evelyn -116 -extant -clusters -quarry -teenagers -luna -##lers -##ister -affiliation -drill -##ashi -panthers -scenic -libya -anita -strengthen -inscriptions -##cated -lace -sued -judith -riots -##uted -mint -##eta -preparations -midst -dub -challenger -##vich -mock -cf -displaced -wicket -breaths -enables -schmidt -analyst -##lum -ag -highlight -automotive -axe -josef -newark -sufficiently -resembles -50th -##pal -flushed -mum -traits -##ante -commodore -incomplete -warming -titular -ceremonial -ethical -118 -celebrating -eighteenth -cao -lima -medalist -mobility -strips -snakes -##city -miniature -zagreb -barton -escapes -umbrella -automated -doubted -differs -cooled -georgetown -dresden -cooked -fade -wyatt -rna 
-jacobs -carlton -abundant -stereo -boost -madras -inning -##hia -spur -ip -malayalam -begged -osaka -groan -escaping -charging -dose -vista -##aj -bud -papa -communists -advocates -edged -tri -##cent -resemble -peaking -necklace -fried -montenegro -saxony -goose -glances -stuttgart -curator -recruit -grocery -sympathetic -##tting -##fort -127 -lotus -randolph -ancestor -##rand -succeeding -jupiter -1798 -macedonian -##heads -hiking -1808 -handing -fischer -##itive -garbage -node -##pies -prone -singular -papua -inclined -attractions -italia -pouring -motioned -grandma -garnered -jacksonville -corp -ego -ringing -aluminum -##hausen -ordering -##foot -drawer -traders -synagogue -##play -##kawa -resistant -wandering -fragile -fiona -teased -var -hardcore -soaked -jubilee -decisive -exposition -mercer -poster -valencia -hale -kuwait -1811 -##ises -##wr -##eed -tavern -gamma -122 -johan -##uer -airways -amino -gil -##ury -vocational -domains -torres -##sp -generator -folklore -outcomes -##keeper -canberra -shooter -fl -beams -confrontation -##lling -##gram -feb -aligned -forestry -pipeline -jax -motorway -conception -decay -##tos -coffin -##cott -stalin -1805 -escorted -minded -##nam -sitcom -purchasing -twilight -veronica -additions -passive -tensions -straw -123 -frequencies -1804 -refugee -cultivation -##iate -christie -clary -bulletin -crept -disposal -##rich -##zong -processor -crescent -##rol -bmw -emphasized -whale -nazis -aurora -##eng -dwelling -hauled -sponsors -toledo -mega -ideology -theatres -tessa -cerambycidae -saves -turtle -cone -suspects -kara -rusty -yelling -greeks -mozart -shades -cocked -participant -##tro -shire -spit -freeze -necessity -##cos -inmates -nielsen -councillors -loaned -uncommon -omar -peasants -botanical -offspring -daniels -formations -jokes -1794 -pioneers -sigma -licensing -##sus -wheelchair -polite -1807 -liquor -pratt -trustee -##uta -forewings -balloon -##zz -kilometre -camping -explicit -casually -shawn -foolish -teammates -nm -hassan -carrie -judged -satisfy -vanessa -knives -selective -cnn -flowed -##lice -eclipse -stressed -eliza -mathematician -cease -cultivated -##roy -commissions -browns -##ania -destroyers -sheridan -meadow -##rius -minerals -##cial -downstream -clash -gram -memoirs -ventures -baha -seymour -archie -midlands -edith -fare -flynn -invite -canceled -tiles -stabbed -boulder -incorporate -amended -camden -facial -mollusk -unreleased -descriptions -yoga -grabs -550 -raises -ramp -shiver -##rose -coined -pioneering -tunes -qing -warwick -tops -119 -melanie -giles -##rous -wandered -##inal -annexed -nov -30th -unnamed -##ished -organizational -airplane -normandy -stoke -whistle -blessing -violations -chased -holders -shotgun -##ctic -outlet -reactor -##vik -tires -tearing -shores -fortified -mascot -constituencies -nc -columnist -productive -tibet -##rta -lineage -hooked -oct -tapes -judging -cody -##gger -hansen -kashmir -triggered -##eva -solved -cliffs -##tree -resisted -anatomy -protesters -transparent -implied -##iga -injection -mattress -excluding -##mbo -defenses -helpless -devotion -##elli -growl -liberals -weber -phenomena -atoms -plug -##iff -mortality -apprentice -howe -convincing -aaa -swimmer -barber -leone -promptly -sodium -def -nowadays -arise -##oning -gloucester -corrected -dignity -norm -erie -##ders -elders -evacuated -sylvia -compression -##yar -hartford -pose -backpack -reasoning -accepts -24th -wipe -millimetres -marcel -##oda -dodgers -albion -1790 -overwhelmed -aerospace -oaks -1795 -showcase -acknowledge 
[... vocabulary deletion hunk elided: thousands of removed lines, one WordPiece token per line (plain words such as "recovering" and "nolan" plus ##-prefixed subword continuations such as "##court"), apparently the bert-base-uncased tokenizer's vocab.txt; the per-line structure of the original diff was flattened in extraction and the full token listing is not reproduced here ...]
-##imate -gel -suspiciously -1767 -sobs -##dington -backbone -hayward -garry -##folding -##nesia -maxi -##oof -##ppe -ellison -galileo -##stand -crimea -frenzy -amour -bumper -matrices -natalia -baking -garth -palestinians -##grove -smack -conveyed -ensembles -gardening -##manship -##rup -##stituting -1640 -harvesting -topography -jing -shifters -dormitory -##carriage -##lston -ist -skulls -##stadt -dolores -jewellery -sarawak -##wai -##zier -fences -christy -confinement -tumbling -credibility -fir -stench -##bria -##plication -##nged -##sam -virtues -##belt -marjorie -pba -##eem -##made -celebrates -schooner -agitated -barley -fulfilling -anthropologist -##pro -restrict -novi -regulating -##nent -padres -##rani -##hesive -loyola -tabitha -milky -olson -proprietor -crambidae -guarantees -intercollegiate -ljubljana -hilda -##sko -ignorant -hooded -##lts -sardinia -##lidae -##vation -frontman -privileged -witchcraft -##gp -jammed -laude -poking -##than -bracket -amazement -yunnan -##erus -maharaja -linnaeus -264 -commissioning -milano -peacefully -##logies -akira -rani -regulator -##36 -grasses -##rance -luzon -crows -compiler -gretchen -seaman -edouard -tab -buccaneers -ellington -hamlets -whig -socialists -##anto -directorial -easton -mythological -##kr -##vary -rhineland -semantic -taut -dune -inventions -succeeds -##iter -replication -branched -##pired -jul -prosecuted -kangaroo -penetrated -##avian -middlesbrough -doses -bleak -madam -predatory -relentless -##vili -reluctance -##vir -hailey -crore -silvery -1759 -monstrous -swimmers -transmissions -hawthorn -informing -##eral -toilets -caracas -crouch -kb -##sett -295 -cartel -hadley -##aling -alexia -yvonne -##biology -cinderella -eton -superb -blizzard -stabbing -industrialist -maximus -##gm -##orus -groves -maud -clade -oversized -comedic -##bella -rosen -nomadic -fulham -montane -beverages -galaxies -redundant -swarm -##rot -##folia -##llis -buckinghamshire -fen -bearings -bahadur -##rom -gilles -phased -dynamite -faber -benoit -vip -##ount -##wd -booking -fractured -tailored -anya -spices -westwood -cairns -auditions -inflammation -steamed -##rocity -##acion -##urne -skyla -thereof -watford -torment -archdeacon -transforms -lulu -demeanor -fucked -serge -##sor -mckenna -minas -entertainer -##icide -caress -originate -residue -##sty -1740 -##ilised -##org -beech -##wana -subsidies -##ghton -emptied -gladstone -ru -firefighters -voodoo -##rcle -het -nightingale -tamara -edmond -ingredient -weaknesses -silhouette -285 -compatibility -withdrawing -hampson -##mona -anguish -giggling -##mber -bookstore -##jiang -southernmost -tilting -##vance -bai -economical -rf -briefcase -dreadful -hinted -projections -shattering -totaling -##rogate -analogue -indicted -periodical -fullback -##dman -haynes -##tenberg -##ffs -##ishment -1745 -thirst -stumble -penang -vigorous -##ddling -##kor -##lium -octave -##ove -##enstein -##inen -##ones -siberian -##uti -cbn -repeal -swaying -##vington -khalid -tanaka -unicorn -otago -plastered -lobe -riddle -##rella -perch -##ishing -croydon -filtered -graeme -tripoli -##ossa -crocodile -##chers -sufi -mined -##tung -inferno -lsu -##phi -swelled -utilizes -£2 -cale -periodicals -styx -hike -informally -coop -lund -##tidae -ala -hen -qui -transformations -disposed -sheath -chickens -##cade -fitzroy -sas -silesia -unacceptable -odisha -1650 -sabrina -pe -spokane -ratios -athena -massage -shen -dilemma -##drum -##riz -##hul -corona -doubtful -niall -##pha -##bino -fines -cite -acknowledging -bangor -ballard 
-bathurst -##resh -huron -mustered -alzheimer -garments -kinase -tyre -warship -##cp -flashback -pulmonary -braun -cheat -kamal -cyclists -constructions -grenades -ndp -traveller -excuses -stomped -signalling -trimmed -futsal -mosques -relevance -##wine -wta -##23 -##vah -##lter -hoc -##riding -optimistic -##´s -deco -sim -interacting -rejecting -moniker -waterways -##ieri -##oku -mayors -gdansk -outnumbered -pearls -##ended -##hampton -fairs -totals -dominating -262 -notions -stairway -compiling -pursed -commodities -grease -yeast -##jong -carthage -griffiths -residual -amc -contraction -laird -sapphire -##marine -##ivated -amalgamation -dissolve -inclination -lyle -packaged -altitudes -suez -canons -graded -lurched -narrowing -boasts -guise -wed -enrico -##ovsky -rower -scarred -bree -cub -iberian -protagonists -bargaining -proposing -trainers -voyages -vans -fishes -##aea -##ivist -##verance -encryption -artworks -kazan -sabre -cleopatra -hepburn -rotting -supremacy -mecklenburg -##brate -burrows -hazards -outgoing -flair -organizes -##ctions -scorpion -##usions -boo -234 -chevalier -dunedin -slapping -##34 -ineligible -pensions -##38 -##omic -manufactures -emails -bismarck -238 -weakening -blackish -ding -mcgee -quo -##rling -northernmost -xx -manpower -greed -sampson -clicking -##ange -##horpe -##inations -##roving -torre -##eptive -##moral -symbolism -38th -asshole -meritorious -outfits -splashed -biographies -sprung -astros -##tale -302 -737 -filly -raoul -nw -tokugawa -linden -clubhouse -##apa -tracts -romano -##pio -putin -tags -##note -chained -dickson -gunshot -moe -gunn -rashid -##tails -zipper -##bas -##nea -contrasted -##ply -##udes -plum -pharaoh -##pile -aw -comedies -ingrid -sandwiches -subdivisions -1100 -mariana -nokia -kamen -hz -delaney -veto -herring -##words -possessive -outlines -##roup -siemens -stairwell -rc -gallantry -messiah -palais -yells -233 -zeppelin -##dm -bolivar -##cede -smackdown -mckinley -##mora -##yt -muted -geologic -finely -unitary -avatar -hamas -maynard -rees -bog -contrasting -##rut -liv -chico -disposition -pixel -##erate -becca -dmitry -yeshiva -narratives -##lva -##ulton -mercenary -sharpe -tempered -navigate -stealth -amassed -keynes -##lini -untouched -##rrie -havoc -lithium -##fighting -abyss -graf -southward -wolverine -balloons -implements -ngos -transitions -##icum -ambushed -concacaf -dormant -economists -##dim -costing -csi -rana -universite -boulders -verity -##llon -collin -mellon -misses -cypress -fluorescent -lifeless -spence -##ulla -crewe -shepard -pak -revelations -##م -jolly -gibbons -paw -##dro -##quel -freeing -##test -shack -fries -palatine -##51 -##hiko -accompaniment -cruising -recycled -##aver -erwin -sorting -synthesizers -dyke -realities -sg -strides -enslaved -wetland -##ghan -competence -gunpowder -grassy -maroon -reactors -objection -##oms -carlson -gearbox -macintosh -radios -shelton -##sho -clergyman -prakash -254 -mongols -trophies -oricon -228 -stimuli -twenty20 -cantonese -cortes -mirrored -##saurus -bhp -cristina -melancholy -##lating -enjoyable -nuevo -##wny -downfall -schumacher -##ind -banging -lausanne -rumbled -paramilitary -reflex -ax -amplitude -migratory -##gall -##ups -midi -barnard -lastly -sherry -##hp -##nall -keystone -##kra -carleton -slippery -##53 -coloring -foe -socket -otter -##rgos -mats -##tose -consultants -bafta -bison -topping -##km -490 -primal -abandonment -transplant -atoll -hideous -mort -pained -reproduced -tae -howling -##turn -unlawful -billionaire -hotter -poised -lansing 
-##chang -dinamo -retro -messing -nfc -domesday -##mina -blitz -timed -##athing -##kley -ascending -gesturing -##izations -signaled -tis -chinatown -mermaid -savanna -jameson -##aint -catalina -##pet -##hers -cochrane -cy -chatting -##kus -alerted -computation -mused -noelle -majestic -mohawk -campo -octagonal -##sant -##hend -241 -aspiring -##mart -comprehend -iona -paralyzed -shimmering -swindon -rhone -##eley -reputed -configurations -pitchfork -agitation -francais -gillian -lipstick -##ilo -outsiders -pontifical -resisting -bitterness -sewer -rockies -##edd -##ucher -misleading -1756 -exiting -galloway -##nging -risked -##heart -246 -commemoration -schultz -##rka -integrating -##rsa -poses -shrieked -##weiler -guineas -gladys -jerking -owls -goldsmith -nightly -penetrating -##unced -lia -##33 -ignited -betsy -##aring -##thorpe -follower -vigorously -##rave -coded -kiran -knit -zoology -tbilisi -##28 -##bered -repository -govt -deciduous -dino -growling -##bba -enhancement -unleashed -chanting -pussy -biochemistry -##eric -kettle -repression -toxicity -nrhp -##arth -##kko -##bush -ernesto -commended -outspoken -242 -mca -parchment -sms -kristen -##aton -bisexual -raked -glamour -navajo -a2 -conditioned -showcased -##hma -spacious -youthful -##esa -usl -appliances -junta -brest -layne -conglomerate -enchanted -chao -loosened -picasso -circulating -inspect -montevideo -##centric -##kti -piazza -spurred -##aith -bari -freedoms -poultry -stamford -lieu -##ect -indigo -sarcastic -bahia -stump -attach -dvds -frankenstein -lille -approx -scriptures -pollen -##script -nmi -overseen -##ivism -tides -proponent -newmarket -inherit -milling -##erland -centralized -##rou -distributors -credentials -drawers -abbreviation -##lco -##xon -downing -uncomfortably -ripe -##oes -erase -franchises -##ever -populace -##bery -##khar -decomposition -pleas -##tet -daryl -sabah -##stle -##wide -fearless -genie -lesions -annette -##ogist -oboe -appendix -nair -dripped -petitioned -maclean -mosquito -parrot -rpg -hampered -1648 -operatic -reservoirs -##tham -irrelevant -jolt -summarized -##fp -medallion -##taff -##− -clawed -harlow -narrower -goddard -marcia -bodied -fremont -suarez -altering -tempest -mussolini -porn -##isms -sweetly -oversees -walkers -solitude -grimly -shrines -hk -ich -supervisors -hostess -dietrich -legitimacy -brushes -expressive -##yp -dissipated -##rse -localized -systemic -##nikov -gettysburg -##js -##uaries -dialogues -muttering -251 -housekeeper -sicilian -discouraged -##frey -beamed -kaladin -halftime -kidnap -##amo -##llet -1754 -synonymous -depleted -instituto -insulin -reprised -##opsis -clashed -##ctric -interrupting -radcliffe -insisting -medici -1715 -ejected -playfully -turbulent -##47 -starvation -##rini -shipment -rebellious -petersen -verification -merits -##rified -cakes -##charged -1757 -milford -shortages -spying -fidelity -##aker -emitted -storylines -harvested -seismic -##iform -cheung -kilda -theoretically -barbie -lynx -##rgy -##tius -goblin -mata -poisonous -##nburg -reactive -residues -obedience -##евич -conjecture -##rac -401 -hating -sixties -kicker -moaning -motown -##bha -emancipation -neoclassical -##hering -consoles -ebert -professorship -##tures -sustaining -assaults -obeyed -affluent -incurred -tornadoes -##eber -##zow -emphasizing -highlanders -cheated -helmets -##ctus -internship -terence -bony -executions -legislators -berries -peninsular -tinged -##aco -1689 -amplifier -corvette -ribbons -lavish -pennant -##lander -worthless -##chfield -##forms -mariano 
-pyrenees -expenditures -##icides -chesterfield -mandir -tailor -39th -sergey -nestled -willed -aristocracy -devotees -goodnight -raaf -rumored -weaponry -remy -appropriations -harcourt -burr -riaa -##lence -limitation -unnoticed -guo -soaking -swamps -##tica -collapsing -tatiana -descriptive -brigham -psalm -##chment -maddox -##lization -patti -caliph -##aja -akron -injuring -serra -##ganj -basins -##sari -astonished -launcher -##church -hilary -wilkins -sewing -##sf -stinging -##fia -##ncia -underwood -startup -##ition -compilations -vibrations -embankment -jurist -##nity -bard -juventus -groundwater -kern -palaces -helium -boca -cramped -marissa -soto -##worm -jae -princely -##ggy -faso -bazaar -warmly -##voking -229 -pairing -##lite -##grate -##nets -wien -freaked -ulysses -rebirth -##alia -##rent -mummy -guzman -jimenez -stilled -##nitz -trajectory -tha -woken -archival -professions -##pts -##pta -hilly -shadowy -shrink -##bolt -norwood -glued -migrate -stereotypes -devoid -##pheus -625 -evacuate -horrors -infancy -gotham -knowles -optic -downloaded -sachs -kingsley -parramatta -darryl -mor -##onale -shady -commence -confesses -kan -##meter -##placed -marlborough -roundabout -regents -frigates -io -##imating -gothenburg -revoked -carvings -clockwise -convertible -intruder -##sche -banged -##ogo -vicky -bourgeois -##mony -dupont -footing -##gum -pd -##real -buckle -yun -penthouse -sane -720 -serviced -stakeholders -neumann -bb -##eers -comb -##gam -catchment -pinning -rallies -typing -##elles -forefront -freiburg -sweetie -giacomo -widowed -goodwill -worshipped -aspirations -midday -##vat -fishery -##trick -bournemouth -turk -243 -hearth -ethanol -guadalajara -murmurs -sl -##uge -afforded -scripted -##hta -wah -##jn -coroner -translucent -252 -memorials -puck -progresses -clumsy -##race -315 -candace -recounted -##27 -##slin -##uve -filtering -##mac -howl -strata -heron -leveled -##ays -dubious -##oja -##т -##wheel -citations -exhibiting -##laya -##mics -##pods -turkic -##lberg -injunction -##ennial -##mit -antibodies -##44 -organise -##rigues -cardiovascular -cushion -inverness -##zquez -dia -cocoa -sibling -##tman -##roid -expanse -feasible -tunisian -algiers -##relli -rus -bloomberg -dso -westphalia -bro -tacoma -281 -downloads -##ours -konrad -duran -##hdi -continuum -jett -compares -legislator -secession -##nable -##gues -##zuka -translating -reacher -##gley -##ła -aleppo -##agi -tc -orchards -trapping -linguist -versatile -drumming -postage -calhoun -superiors -##mx -barefoot -leary -##cis -ignacio -alfa -kaplan -##rogen -bratislava -mori -##vot -disturb -haas -313 -cartridges -gilmore -radiated -salford -tunic -hades -##ulsive -archeological -delilah -magistrates -auditioned -brewster -charters -empowerment -blogs -cappella -dynasties -iroquois -whipping -##krishna -raceway -truths -myra -weaken -judah -mcgregor -##horse -mic -refueling -37th -burnley -bosses -markus -premio -query -##gga -dunbar -##economic -darkest -lyndon -sealing -commendation -reappeared -##mun -addicted -ezio -slaughtered -satisfactory -shuffle -##eves -##thic -##uj -fortification -warrington -##otto -resurrected -fargo -mane -##utable -##lei -##space -foreword -ox -##aris -##vern -abrams -hua -##mento -sakura -##alo -uv -sentimental -##skaya -midfield -##eses -sturdy -scrolls -macleod -##kyu -entropy -##lance -mitochondrial -cicero -excelled -thinner -convoys -perceive -##oslav -##urable -systematically -grind -burkina -287 -##tagram -ops -##aman -guantanamo -##cloth -##tite -forcefully -wavy -##jou 
-pointless -##linger -##tze -layton -portico -superficial -clerical -outlaws -##hism -burials -muir -##inn -creditors -hauling -rattle -##leg -calais -monde -archers -reclaimed -dwell -wexford -hellenic -falsely -remorse -##tek -dough -furnishings -##uttered -gabon -neurological -novice -##igraphy -contemplated -pulpit -nightstand -saratoga -##istan -documenting -pulsing -taluk -##firmed -busted -marital -##rien -disagreements -wasps -##yes -hodge -mcdonnell -mimic -fran -pendant -dhabi -musa -##nington -congratulations -argent -darrell -concussion -losers -regrets -thessaloniki -reversal -donaldson -hardwood -thence -achilles -ritter -##eran -demonic -jurgen -prophets -goethe -eki -classmate -buff -##cking -yank -irrational -##inging -perished -seductive -qur -sourced -##crat -##typic -mustard -ravine -barre -horizontally -characterization -phylogenetic -boise -##dit -##runner -##tower -brutally -intercourse -seduce -##bbing -fay -ferris -ogden -amar -nik -unarmed -##inator -evaluating -kyrgyzstan -sweetness -##lford -##oki -mccormick -meiji -notoriety -stimulate -disrupt -figuring -instructional -mcgrath -##zoo -groundbreaking -##lto -flinch -khorasan -agrarian -bengals -mixer -radiating -##sov -ingram -pitchers -nad -tariff -##cript -tata -##codes -##emi -##ungen -appellate -lehigh -##bled -##giri -brawl -duct -texans -##ciation -##ropolis -skipper -speculative -vomit -doctrines -stresses -253 -davy -graders -whitehead -jozef -timely -cumulative -haryana -paints -appropriately -boon -cactus -##ales -##pid -dow -legions -##pit -perceptions -1730 -picturesque -##yse -periphery -rune -wr -##aha -celtics -sentencing -whoa -##erin -confirms -variance -425 -moines -mathews -spade -rave -m1 -fronted -fx -blending -alleging -reared -##gl -237 -##paper -grassroots -eroded -##free -##physical -directs -ordeal -##sław -accelerate -hacker -rooftop -##inia -lev -buys -cebu -devote -##lce -specialising -##ulsion -choreographed -repetition -warehouses -##ryl -paisley -tuscany -analogy -sorcerer -hash -huts -shards -descends -exclude -nix -chaplin -gaga -ito -vane -##drich -causeway -misconduct -limo -orchestrated -glands -jana -##kot -u2 -##mple -##sons -branching -contrasts -scoop -longed -##virus -chattanooga -##75 -syrup -cornerstone -##tized -##mind -##iaceae -careless -precedence -frescoes -##uet -chilled -consult -modelled -snatch -peat -##thermal -caucasian -humane -relaxation -spins -temperance -##lbert -occupations -lambda -hybrids -moons -mp3 -##oese -247 -rolf -societal -yerevan -ness -##ssler -befriended -mechanized -nominate -trough -boasted -cues -seater -##hom -bends -##tangle -conductors -emptiness -##lmer -eurasian -adriatic -tian -##cie -anxiously -lark -propellers -chichester -jock -ev -2a -##holding -credible -recounts -tori -loyalist -abduction -##hoot -##redo -nepali -##mite -ventral -tempting -##ango -##crats -steered -##wice -javelin -dipping -laborers -prentice -looming -titanium -##ː -badges -emir -tensor -##ntation -egyptians -rash -denies -hawthorne -lombard -showers -wehrmacht -dietary -trojan -##reus -welles -executing -horseshoe -lifeboat -##lak -elsa -infirmary -nearing -roberta -boyer -mutter -trillion -joanne -##fine -##oked -sinks -vortex -uruguayan -clasp -sirius -##block -accelerator -prohibit -sunken -byu -chronological -diplomats -ochreous -510 -symmetrical -1644 -maia -##tology -salts -reigns -atrocities -##ия -hess -bared -issn -##vyn -cater -saturated -##cycle -##isse -sable -voyager -dyer -yusuf -##inge -fountains -wolff -##39 -##nni -engraving -rollins 
-atheist -ominous -##ault -herr -chariot -martina -strung -##fell -##farlane -horrific -sahib -gazes -saetan -erased -ptolemy -##olic -flushing -lauderdale -analytic -##ices -530 -navarro -beak -gorilla -herrera -broom -guadalupe -raiding -sykes -311 -bsc -deliveries -1720 -invasions -carmichael -tajikistan -thematic -ecumenical -sentiments -onstage -##rians -##brand -##sume -catastrophic -flanks -molten -##arns -waller -aimee -terminating -##icing -alternately -##oche -nehru -printers -outraged -##eving -empires -template -banners -repetitive -za -##oise -vegetarian -##tell -guiana -opt -cavendish -lucknow -synthesized -##hani -##mada -finalized -##ctable -fictitious -mayoral -unreliable -##enham -embracing -peppers -rbis -##chio -##neo -inhibition -slashed -togo -orderly -embroidered -safari -salty -236 -barron -benito -totaled -##dak -pubs -simulated -caden -devin -tolkien -momma -welding -sesame -##ept -gottingen -hardness -630 -shaman -temeraire -620 -adequately -pediatric -##kit -ck -assertion -radicals -composure -cadence -seafood -beaufort -lazarus -mani -warily -cunning -kurdistan -249 -cantata -##kir -ares -##41 -##clusive -nape -townland -geared -insulted -flutter -boating -violate -draper -dumping -malmo -##hh -##romatic -firearm -alta -bono -obscured -##clave -exceeds -panorama -unbelievable -##train -preschool -##essed -disconnected -installing -rescuing -secretaries -accessibility -##castle -##drive -##ifice -##film -bouts -slug -waterway -mindanao -##buro -##ratic -halves -##ل -calming -liter -maternity -adorable -bragg -electrification -mcc -##dote -roxy -schizophrenia -##body -munoz -kaye -whaling -239 -mil -tingling -tolerant -##ago -unconventional -volcanoes -##finder -deportivo -##llie -robson -kaufman -neuroscience -wai -deportation -masovian -scraping -converse -##bh -hacking -bulge -##oun -administratively -yao -580 -amp -mammoth -booster -claremont -hooper -nomenclature -pursuits -mclaughlin -melinda -##sul -catfish -barclay -substrates -taxa -zee -originals -kimberly -packets -padma -##ality -borrowing -ostensibly -solvent -##bri -##genesis -##mist -lukas -shreveport -veracruz -##ь -##lou -##wives -cheney -tt -anatolia -hobbs -##zyn -cyclic -radiant -alistair -greenish -siena -dat -independents -##bation -conform -pieter -hyper -applicant -bradshaw -spores -telangana -vinci -inexpensive -nuclei -322 -jang -nme -soho -spd -##ign -cradled -receptionist -pow -##43 -##rika -fascism -##ifer -experimenting -##ading -##iec -##region -345 -jocelyn -maris -stair -nocturnal -toro -constabulary -elgin -##kker -msc -##giving -##schen -##rase -doherty -doping -sarcastically -batter -maneuvers -##cano -##apple -##gai -##git -intrinsic -##nst -##stor -1753 -showtime -cafes -gasps -lviv -ushered -##thed -fours -restart -astonishment -transmitting -flyer -shrugs -##sau -intriguing -cones -dictated -mushrooms -medial -##kovsky -##elman -escorting -gaped -##26 -godfather -##door -##sell -djs -recaptured -timetable -vila -1710 -3a -aerodrome -mortals -scientology -##orne -angelina -mag -convection -unpaid -insertion -intermittent -lego -##nated -endeavor -kota -pereira -##lz -304 -bwv -glamorgan -insults -agatha -fey -##cend -fleetwood -mahogany -protruding -steamship -zeta -##arty -mcguire -suspense -##sphere -advising -urges -##wala -hurriedly -meteor -gilded -inline -arroyo -stalker -##oge -excitedly -revered -##cure -earle -introductory -##break -##ilde -mutants -puff -pulses -reinforcement -##haling -curses -lizards -stalk -correlated -##fixed -fallout -macquarie -##unas 
-bearded -denton -heaving -802 -##ocation -winery -assign -dortmund -##lkirk -everest -invariant -charismatic -susie -##elling -bled -lesley -telegram -sumner -bk -##ogen -##к -wilcox -needy -colbert -duval -##iferous -##mbled -allotted -attends -imperative -##hita -replacements -hawker -##inda -insurgency -##zee -##eke -casts -##yla -680 -ives -transitioned -##pack -##powering -authoritative -baylor -flex -cringed -plaintiffs -woodrow -##skie -drastic -ape -aroma -unfolded -commotion -nt -preoccupied -theta -routines -lasers -privatization -wand -domino -ek -clenching -nsa -strategically -showered -bile -handkerchief -pere -storing -christophe -insulting -316 -nakamura -romani -asiatic -magdalena -palma -cruises -stripping -405 -konstantin -soaring -##berman -colloquially -forerunner -havilland -incarcerated -parasites -sincerity -##utus -disks -plank -saigon -##ining -corbin -homo -ornaments -powerhouse -##tlement -chong -fastened -feasibility -idf -morphological -usable -##nish -##zuki -aqueduct -jaguars -keepers -##flies -aleksandr -faust -assigns -ewing -bacterium -hurled -tricky -hungarians -integers -wallis -321 -yamaha -##isha -hushed -oblivion -aviator -evangelist -friars -##eller -monograph -ode -##nary -airplanes -labourers -charms -##nee -1661 -hagen -tnt -rudder -fiesta -transcript -dorothea -ska -inhibitor -maccabi -retorted -raining -encompassed -clauses -menacing -1642 -lineman -##gist -vamps -##ape -##dick -gloom -##rera -dealings -easing -seekers -##nut -##pment -helens -unmanned -##anu -##isson -basics -##amy -##ckman -adjustments -1688 -brutality -horne -##zell -sui -##55 -##mable -aggregator -##thal -rhino -##drick -##vira -counters -zoom -##01 -##rting -mn -montenegrin -packard -##unciation -##♭ -##kki -reclaim -scholastic -thugs -pulsed -##icia -syriac -quan -saddam -banda -kobe -blaming -buddies -dissent -##lusion -##usia -corbett -jaya -delle -erratic -lexie -##hesis -435 -amiga -hermes -##pressing -##leen -chapels -gospels -jamal -##uating -compute -revolving -warp -##sso -##thes -armory -##eras -##gol -antrim -loki -##kow -##asian -##good -##zano -braid -handwriting -subdistrict -funky -pantheon -##iculate -concurrency -estimation -improper -juliana -##his -newcomers -johnstone -staten -communicated -##oco -##alle -sausage -stormy -##stered -##tters -superfamily -##grade -acidic -collateral -tabloid -##oped -##rza -bladder -austen -##ellant -mcgraw -##hay -hannibal -mein -aquino -lucifer -wo -badger -boar -cher -christensen -greenberg -interruption -##kken -jem -244 -mocked -bottoms -cambridgeshire -##lide -sprawling -##bbly -eastwood -ghent -synth -##buck -advisers -##bah -nominally -hapoel -qu -daggers -estranged -fabricated -towels -vinnie -wcw -misunderstanding -anglia -nothin -unmistakable -##dust -##lova -chilly -marquette -truss -##edge -##erine -reece -##lty -##chemist -##connected -272 -308 -41st -bash -raion -waterfalls -##ump -##main -labyrinth -queue -theorist -##istle -bharatiya -flexed -soundtracks -rooney -leftist -patrolling -wharton -plainly -alleviate -eastman -schuster -topographic -engages -immensely -unbearable -fairchild -1620 -dona -lurking -parisian -oliveira -ia -indictment -hahn -bangladeshi -##aster -vivo -##uming -##ential -antonia -expects -indoors -kildare -harlan -##logue -##ogenic -##sities -forgiven -##wat -childish -tavi -##mide -##orra -plausible -grimm -successively -scooted -##bola -##dget -##rith -spartans -emery -flatly -azure -epilogue -##wark -flourish -##iny -##tracted -##overs -##oshi -bestseller -distressed -receipt 
-spitting -hermit -topological -##cot -drilled -subunit -francs -##layer -eel -##fk -##itas -octopus -footprint -petitions -ufo -##say -##foil -interfering -leaking -palo -##metry -thistle -valiant -##pic -narayan -mcpherson -##fast -gonzales -##ym -##enne -dustin -novgorod -solos -##zman -doin -##raph -##patient -##meyer -soluble -ashland -cuffs -carole -pendleton -whistling -vassal -##river -deviation -revisited -constituents -rallied -rotate -loomed -##eil -##nting -amateurs -augsburg -auschwitz -crowns -skeletons -##cona -bonnet -257 -dummy -globalization -simeon -sleeper -mandal -differentiated -##crow -##mare -milne -bundled -exasperated -talmud -owes -segregated -##feng -##uary -dentist -piracy -props -##rang -devlin -##torium -malicious -paws -##laid -dependency -##ergy -##fers -##enna -258 -pistons -rourke -jed -grammatical -tres -maha -wig -512 -ghostly -jayne -##achal -##creen -##ilis -##lins -##rence -designate -##with -arrogance -cambodian -clones -showdown -throttle -twain -##ception -lobes -metz -nagoya -335 -braking -##furt -385 -roaming -##minster -amin -crippled -##37 -##llary -indifferent -hoffmann -idols -intimidating -1751 -261 -influenza -memo -onions -1748 -bandage -consciously -##landa -##rage -clandestine -observes -swiped -tangle -##ener -##jected -##trum -##bill -##lta -hugs -congresses -josiah -spirited -##dek -humanist -managerial -filmmaking -inmate -rhymes -debuting -grimsby -ur -##laze -duplicate -vigor -##tf -republished -bolshevik -refurbishment -antibiotics -martini -methane -newscasts -royale -horizons -levant -iain -visas -##ischen -paler -##around -manifestation -snuck -alf -chop -futile -pedestal -rehab -##kat -bmg -kerman -res -fairbanks -jarrett -abstraction -saharan -##zek -1746 -procedural -clearer -kincaid -sash -luciano -##ffey -crunch -helmut -##vara -revolutionaries -##tute -creamy -leach -##mmon -1747 -permitting -nes -plight -wendell -##lese -contra -ts -clancy -ipa -mach -staples -autopsy -disturbances -nueva -karin -pontiac -##uding -proxy -venerable -haunt -leto -bergman -expands -##helm -wal -##pipe -canning -celine -cords -obesity -##enary -intrusion -planner -##phate -reasoned -sequencing -307 -harrow -##chon -##dora -marred -mcintyre -repay -tarzan -darting -248 -harrisburg -margarita -repulsed -##hur -##lding -belinda -hamburger -novo -compliant -runways -bingham -registrar -skyscraper -ic -cuthbert -improvisation -livelihood -##corp -##elial -admiring -##dened -sporadic -believer -casablanca -popcorn -##29 -asha -shovel -##bek -##dice -coiled -tangible -##dez -casper -elsie -resin -tenderness -rectory -##ivision -avail -sonar -##mori -boutique -##dier -guerre -bathed -upbringing -vaulted -sandals -blessings -##naut -##utnant -1680 -306 -foxes -pia -corrosion -hesitantly -confederates -crystalline -footprints -shapiro -tirana -valentin -drones -45th -microscope -shipments -texted -inquisition -wry -guernsey -unauthorized -resigning -760 -ripple -schubert -stu -reassure -felony -##ardo -brittle -koreans -##havan -##ives -dun -implicit -tyres -##aldi -##lth -magnolia -##ehan -##puri -##poulos -aggressively -fei -gr -familiarity -##poo -indicative -##trust -fundamentally -jimmie -overrun -395 -anchors -moans -##opus -britannia -armagh -##ggle -purposely -seizing -##vao -bewildered -mundane -avoidance -cosmopolitan -geometridae -quartermaster -caf -415 -chatter -engulfed -gleam -purge -##icate -juliette -jurisprudence -guerra -revisions -##bn -casimir -brew -##jm -1749 -clapton -cloudy -conde -hermitage -278 -simulations -torches 
-vincenzo -matteo -##rill -hidalgo -booming -westbound -accomplishment -tentacles -unaffected -##sius -annabelle -flopped -sloping -##litz -dreamer -interceptor -vu -##loh -consecration -copying -messaging -breaker -climates -hospitalized -1752 -torino -afternoons -winfield -witnessing -##teacher -breakers -choirs -sawmill -coldly -##ege -sipping -haste -uninhabited -conical -bibliography -pamphlets -severn -edict -##oca -deux -illnesses -grips -##pl -rehearsals -sis -thinkers -tame -##keepers -1690 -acacia -reformer -##osed -##rys -shuffling -##iring -##shima -eastbound -ionic -rhea -flees -littered -##oum -rocker -vomiting -groaning -champ -overwhelmingly -civilizations -paces -sloop -adoptive -##tish -skaters -##vres -aiding -mango -##joy -nikola -shriek -##ignon -pharmaceuticals -##mg -tuna -calvert -gustavo -stocked -yearbook -##urai -##mana -computed -subsp -riff -hanoi -kelvin -hamid -moors -pastures -summons -jihad -nectar -##ctors -bayou -untitled -pleasing -vastly -republics -intellect -##η -##ulio -##tou -crumbling -stylistic -sb -##ی -consolation -frequented -h₂o -walden -widows -##iens -404 -##ignment -chunks -improves -288 -grit -recited -##dev -snarl -sociological -##arte -##gul -inquired -##held -bruise -clube -consultancy -homogeneous -hornets -multiplication -pasta -prick -savior -##grin -##kou -##phile -yoon -##gara -grimes -vanishing -cheering -reacting -bn -distillery -##quisite -##vity -coe -dockyard -massif -##jord -escorts -voss -##valent -byte -chopped -hawke -illusions -workings -floats -##koto -##vac -kv -annapolis -madden -##onus -alvaro -noctuidae -##cum -##scopic -avenge -steamboat -forte -illustrates -erika -##trip -570 -dew -nationalities -bran -manifested -thirsty -diversified -muscled -reborn -##standing -arson -##lessness -##dran -##logram -##boys -##kushima -##vious -willoughby -##phobia -286 -alsace -dashboard -yuki -##chai -granville -myspace -publicized -tricked -##gang -adjective -##ater -relic -reorganisation -enthusiastically -indications -saxe -##lassified -consolidate -iec -padua -helplessly -ramps -renaming -regulars -pedestrians -accents -convicts -inaccurate -lowers -mana -##pati -barrie -bjp -outta -someplace -berwick -flanking -invoked -marrow -sparsely -excerpts -clothed -rei -##ginal -wept -##straße -##vish -alexa -excel -##ptive -membranes -aquitaine -creeks -cutler -sheppard -implementations -ns -##dur -fragrance -budge -concordia -magnesium -marcelo -##antes -gladly -vibrating -##rral -##ggles -montrose -##omba -lew -seamus -1630 -cocky -##ament -##uen -bjorn -##rrick -fielder -fluttering -##lase -methyl -kimberley -mcdowell -reductions -barbed -##jic -##tonic -aeronautical -condensed -distracting -##promising -huffed -##cala -##sle -claudius -invincible -missy -pious -balthazar -ci -##lang -butte -combo -orson -##dication -myriad -1707 -silenced -##fed -##rh -coco -netball -yourselves -##oza -clarify -heller -peg -durban -etudes -offender -roast -blackmail -curvature -##woods -vile -309 -illicit -suriname -##linson -overture -1685 -bubbling -gymnast -tucking -##mming -##ouin -maldives -##bala -gurney -##dda -##eased -##oides -backside -pinto -jars -racehorse -tending -##rdial -baronetcy -wiener -duly -##rke -barbarian -cupping -flawed -##thesis -bertha -pleistocene -puddle -swearing -##nob -##tically -fleeting -prostate -amulet -educating -##mined -##iti -##tler -75th -jens -respondents -analytics -cavaliers -papacy -raju -##iente -##ulum -##tip -funnel -271 -disneyland -##lley -sociologist -##iam -2500 -faulkner -louvre -menon -##dson 
-276 -##ower -afterlife -mannheim -peptide -referees -comedians -meaningless -##anger -##laise -fabrics -hurley -renal -sleeps -##bour -##icle -breakout -kristin -roadside -animator -clover -disdain -unsafe -redesign -##urity -firth -barnsley -portage -reset -narrows -268 -commandos -expansive -speechless -tubular -##lux -essendon -eyelashes -smashwords -##yad -##bang -##claim -craved -sprinted -chet -somme -astor -wrocław -orton -266 -bane -##erving -##uing -mischief -##amps -##sund -scaling -terre -##xious -impairment -offenses -undermine -moi -soy -contiguous -arcadia -inuit -seam -##tops -macbeth -rebelled -##icative -##iot -590 -elaborated -frs -uniformed -##dberg -259 -powerless -priscilla -stimulated -980 -qc -arboretum -frustrating -trieste -bullock -##nified -enriched -glistening -intern -##adia -locus -nouvelle -ollie -ike -lash -starboard -ee -tapestry -headlined -hove -rigged -##vite -pollock -##yme -thrive -clustered -cas -roi -gleamed -olympiad -##lino -pressured -regimes -##hosis -##lick -ripley -##ophone -kickoff -gallon -rockwell -##arable -crusader -glue -revolutions -scrambling -1714 -grover -##jure -englishman -aztec -263 -contemplating -coven -ipad -preach -triumphant -tufts -##esian -rotational -##phus -328 -falkland -##brates -strewn -clarissa -rejoin -environmentally -glint -banded -drenched -moat -albanians -johor -rr -maestro -malley -nouveau -shaded -taxonomy -v6 -adhere -bunk -airfields -##ritan -1741 -encompass -remington -tran -##erative -amelie -mazda -friar -morals -passions -##zai -breadth -vis -##hae -argus -burnham -caressing -insider -rudd -##imov -##mini -##rso -italianate -murderous -textual -wainwright -armada -bam -weave -timer -##taken -##nh -fra -##crest -ardent -salazar -taps -tunis -##ntino -allegro -gland -philanthropic -##chester -implication -##optera -esq -judas -noticeably -wynn -##dara -inched -indexed -crises -villiers -bandit -royalties -patterned -cupboard -interspersed -accessory -isla -kendrick -entourage -stitches -##esthesia -headwaters -##ior -interlude -distraught -draught -1727 -##basket -biased -sy -transient -triad -subgenus -adapting -kidd -shortstop -##umatic -dimly -spiked -mcleod -reprint -nellie -pretoria -windmill -##cek -singled -##mps -273 -reunite -##orous -747 -bankers -outlying -##omp -##ports -##tream -apologies -cosmetics -patsy -##deh -##ocks -##yson -bender -nantes -serene -##nad -lucha -mmm -323 -##cius -##gli -cmll -coinage -nestor -juarez -##rook -smeared -sprayed -twitching -sterile -irina -embodied -juveniles -enveloped -miscellaneous -cancers -dq -gulped -luisa -crested -swat -donegal -ref -##anov -##acker -hearst -mercantile -##lika -doorbell -ua -vicki -##alla -##som -bilbao -psychologists -stryker -sw -horsemen -turkmenistan -wits -##national -anson -mathew -screenings -##umb -rihanna -##agne -##nessy -aisles -##iani -##osphere -hines -kenton -saskatoon -tasha -truncated -##champ -##itan -mildred -advises -fredrik -interpreting -inhibitors -##athi -spectroscopy -##hab -##kong -karim -panda -##oia -##nail -##vc -conqueror -kgb -leukemia -##dity -arrivals -cheered -pisa -phosphorus -shielded -##riated -mammal -unitarian -urgently -chopin -sanitary -##mission -spicy -drugged -hinges -##tort -tipping -trier -impoverished -westchester -##caster -267 -epoch -nonstop -##gman -##khov -aromatic -centrally -cerro -##tively -##vio -billions -modulation -sedimentary -283 -facilitating -outrageous -goldstein -##eak -##kt -ld -maitland -penultimate -pollard -##dance -fleets -spaceship -vertebrae -##nig -alcoholism -als 
-recital -##bham -##ference -##omics -m2 -##bm -trois -##tropical -##в -commemorates -##meric -marge -##raction -1643 -670 -cosmetic -ravaged -##ige -catastrophe -eng -##shida -albrecht -arterial -bellamy -decor -harmon -##rde -bulbs -synchronized -vito -easiest -shetland -shielding -wnba -##glers -##ssar -##riam -brianna -cumbria -##aceous -##rard -cores -thayer -##nsk -brood -hilltop -luminous -carts -keynote -larkin -logos -##cta -##ا -##mund -##quay -lilith -tinted -277 -wrestle -mobilization -##uses -sequential -siam -bloomfield -takahashi -274 -##ieving -presenters -ringo -blazed -witty -##oven -##ignant -devastation -haydn -harmed -newt -therese -##peed -gershwin -molina -rabbis -sudanese -001 -innate -restarted -##sack -##fus -slices -wb -##shah -enroll -hypothetical -hysterical -1743 -fabio -indefinite -warped -##hg -exchanging -525 -unsuitable -##sboro -gallo -1603 -bret -cobalt -homemade -##hunter -mx -operatives -##dhar -terraces -durable -latch -pens -whorls -##ctuated -##eaux -billing -ligament -succumbed -##gly -regulators -spawn -##brick -##stead -filmfare -rochelle -##nzo -1725 -circumstance -saber -supplements -##nsky -##tson -crowe -wellesley -carrot -##9th -##movable -primate -drury -sincerely -topical -##mad -##rao -callahan -kyiv -smarter -tits -undo -##yeh -announcements -anthologies -barrio -nebula -##islaus -##shaft -##tyn -bodyguards -2021 -assassinate -barns -emmett -scully -##mah -##yd -##eland -##tino -##itarian -demoted -gorman -lashed -prized -adventist -writ -##gui -alla -invertebrates -##ausen -1641 -amman -1742 -align -healy -redistribution -##gf -##rize -insulation -##drop -adherents -hezbollah -vitro -ferns -yanking -269 -php -registering -uppsala -cheerleading -confines -mischievous -tully -##ross -49th -docked -roam -stipulated -pumpkin -##bry -prompt -##ezer -blindly -shuddering -craftsmen -frail -scented -katharine -scramble -shaggy -sponge -helix -zaragoza -279 -##52 -43rd -backlash -fontaine -seizures -posse -cowan -nonfiction -telenovela -wwii -hammered -undone -##gpur -encircled -irs -##ivation -artefacts -oneself -searing -smallpox -##belle -##osaurus -shandong -breached -upland -blushing -rankin -infinitely -psyche -tolerated -docking -evicted -##col -unmarked -##lving -gnome -lettering -litres -musique -##oint -benevolent -##jal -blackened -##anna -mccall -racers -tingle -##ocene -##orestation -introductions -radically -292 -##hiff -##باد -1610 -1739 -munchen -plead -##nka -condo -scissors -##sight -##tens -apprehension -##cey -##yin -hallmark -watering -formulas -sequels -##llas -aggravated -bae -commencing -##building -enfield -prohibits -marne -vedic -civilized -euclidean -jagger -beforehand -blasts -dumont -##arney -##nem -740 -conversions -hierarchical -rios -simulator -##dya -##lellan -hedges -oleg -thrusts -shadowed -darby -maximize -1744 -gregorian -##nded -##routed -sham -unspecified -##hog -emory -factual -##smo -##tp -fooled -##rger -ortega -wellness -marlon -##oton -##urance -casket -keating -ley -enclave -##ayan -char -influencing -jia -##chenko -412 -ammonia -erebidae -incompatible -violins -cornered -##arat -grooves -astronauts -columbian -rampant -fabrication -kyushu -mahmud -vanish -##dern -mesopotamia -##lete -ict -##rgen -caspian -kenji -pitted -##vered -999 -grimace -roanoke -tchaikovsky -twinned -##analysis -##awan -xinjiang -arias -clemson -kazakh -sizable -1662 -##khand -##vard -plunge -tatum -vittorio -##nden -cholera -##dana -##oper -bracing -indifference -projectile -superliga -##chee -realises -upgrading -299 -porte 
-retribution -##vies -nk -stil -##resses -ama -bureaucracy -blackberry -bosch -testosterone -collapses -greer -##pathic -ioc -fifties -malls -##erved -bao -baskets -adolescents -siegfried -##osity -##tosis -mantra -detecting -existent -fledgling -##cchi -dissatisfied -gan -telecommunication -mingled -sobbed -6000 -controversies -outdated -taxis -##raus -fright -slams -##lham -##fect -##tten -detectors -fetal -tanned -##uw -fray -goth -olympian -skipping -mandates -scratches -sheng -unspoken -hyundai -tracey -hotspur -restrictive -##buch -americana -mundo -##bari -burroughs -diva -vulcan -##6th -distinctions -thumping -##ngen -mikey -sheds -fide -rescues -springsteen -vested -valuation -##ece -##ely -pinnacle -rake -sylvie -##edo -almond -quivering -##irus -alteration -faltered -##wad -51st -hydra -ticked -##kato -recommends -##dicated -antigua -arjun -stagecoach -wilfred -trickle -pronouns -##pon -aryan -nighttime -##anian -gall -pea -stitch -##hei -leung -milos -##dini -eritrea -nexus -starved -snowfall -kant -parasitic -cot -discus -hana -strikers -appleton -kitchens -##erina -##partisan -##itha -##vius -disclose -metis -##channel -1701 -tesla -##vera -fitch -1735 -blooded -##tila -decimal -##tang -##bai -cyclones -eun -bottled -peas -pensacola -basha -bolivian -crabs -boil -lanterns -partridge -roofed -1645 -necks -##phila -opined -patting -##kla -##lland -chuckles -volta -whereupon -##nche -devout -euroleague -suicidal -##dee -inherently -involuntary -knitting -nasser -##hide -puppets -colourful -courageous -southend -stills -miraculous -hodgson -richer -rochdale -ethernet -greta -uniting -prism -umm -##haya -##itical -##utation -deterioration -pointe -prowess -##ropriation -lids -scranton -billings -subcontinent -##koff -##scope -brute -kellogg -psalms -degraded -##vez -stanisław -##ructured -ferreira -pun -astonishing -gunnar -##yat -arya -prc -gottfried -##tight -excursion -##ographer -dina -##quil -##nare -huffington -illustrious -wilbur -gundam -verandah -##zard -naacp -##odle -constructive -fjord -kade -##naud -generosity -thrilling -baseline -cayman -frankish -plastics -accommodations -zoological -##fting -cedric -qb -motorized -##dome -##otted -squealed -tackled -canucks -budgets -situ -asthma -dail -gabled -grasslands -whimpered -writhing -judgments -##65 -minnie -pv -##carbon -bananas -grille -domes -monique -odin -maguire -markham -tierney -##estra -##chua -libel -poke -speedy -atrium -laval -notwithstanding -##edly -fai -kala -##sur -robb -##sma -listings -luz -supplementary -tianjin -##acing -enzo -jd -ric -scanner -croats -transcribed -##49 -arden -cv -##hair -##raphy -##lver -##uy -357 -seventies -staggering -alam -horticultural -hs -regression -timbers -blasting -##ounded -montagu -manipulating -##cit -catalytic -1550 -troopers -##meo -condemnation -fitzpatrick -##oire -##roved -inexperienced -1670 -castes -##lative -outing -314 -dubois -flicking -quarrel -ste -learners -1625 -iq -whistled -##class -282 -classify -tariffs -temperament -355 -folly -liszt -##yles -immersed -jordanian -ceasefire -apparel -extras -maru -fished -##bio -harta -stockport -assortment -craftsman -paralysis -transmitters -##cola -blindness -##wk -fatally -proficiency -solemnly -##orno -repairing -amore -groceries -ultraviolet -##chase -schoolhouse -##tua -resurgence -nailed -##otype -##× -ruse -saliva -diagrams -##tructing -albans -rann -thirties -1b -antennas -hilarious -cougars -paddington -stats -##eger -breakaway -ipod -reza -authorship -prohibiting -scoffed -##etz -##ttle -conscription 
-defected -trondheim -##fires -ivanov -keenan -##adan -##ciful -##fb -##slow -locating -##ials -##tford -cadiz -basalt -blankly -interned -rags -rattling -##tick -carpathian -reassured -sync -bum -guildford -iss -staunch -##onga -astronomers -sera -sofie -emergencies -susquehanna -##heard -duc -mastery -vh1 -williamsburg -bayer -buckled -craving -##khan -##rdes -bloomington -##write -alton -barbecue -##bians -justine -##hri -##ndt -delightful -smartphone -newtown -photon -retrieval -peugeot -hissing -##monium -##orough -flavors -lighted -relaunched -tainted -##games -##lysis -anarchy -microscopic -hopping -adept -evade -evie -##beau -inhibit -sinn -adjustable -hurst -intuition -wilton -cisco -44th -lawful -lowlands -stockings -thierry -##dalen -##hila -##nai -fates -prank -tb -maison -lobbied -provocative -1724 -4a -utopia -##qual -carbonate -gujarati -purcell -##rford -curtiss -##mei -overgrown -arenas -mediation -swallows -##rnik -respectful -turnbull -##hedron -##hope -alyssa -ozone -##ʻi -ami -gestapo -johansson -snooker -canteen -cuff -declines -empathy -stigma -##ags -##iner -##raine -taxpayers -gui -volga -##wright -##copic -lifespan -overcame -tattooed -enactment -giggles -##ador -##camp -barrington -bribe -obligatory -orbiting -peng -##enas -elusive -sucker -##vating -cong -hardship -empowered -anticipating -estrada -cryptic -greasy -detainees -planck -sudbury -plaid -dod -marriott -kayla -##ears -##vb -##zd -mortally -##hein -cognition -radha -319 -liechtenstein -meade -richly -argyle -harpsichord -liberalism -trumpets -lauded -tyrant -salsa -tiled -lear -promoters -reused -slicing -trident -##chuk -##gami -##lka -cantor -checkpoint -##points -gaul -leger -mammalian -##tov -##aar -##schaft -doha -frenchman -nirvana -##vino -delgado -headlining -##eron -##iography -jug -tko -1649 -naga -intersections -##jia -benfica -nawab -##suka -ashford -gulp -##deck -##vill -##rug -brentford -frazier -pleasures -dunne -potsdam -shenzhen -dentistry -##tec -flanagan -##dorff -##hear -chorale -dinah -prem -quezon -##rogated -relinquished -sutra -terri -##pani -flaps -##rissa -poly -##rnet -homme -aback -##eki -linger -womb -##kson -##lewood -doorstep -orthodoxy -threaded -westfield -##rval -dioceses -fridays -subsided -##gata -loyalists -##biotic -##ettes -letterman -lunatic -prelate -tenderly -invariably -souza -thug -winslow -##otide -furlongs -gogh -jeopardy -##runa -pegasus -##umble -humiliated -standalone -tagged -##roller -freshmen -klan -##bright -attaining -initiating -transatlantic -logged -viz -##uance -1723 -combatants -intervening -stephane -chieftain -despised -grazed -317 -cdc -galveston -godzilla -macro -simulate -##planes -parades -##esses -960 -##ductive -##unes -equator -overdose -##cans -##hosh -##lifting -joshi -epstein -sonora -treacherous -aquatics -manchu -responsive -##sation -supervisory -##christ -##llins -##ibar -##balance -##uso -kimball -karlsruhe -mab -##emy -ignores -phonetic -reuters -spaghetti -820 -almighty -danzig -rumbling -tombstone -designations -lured -outset -##felt -supermarkets -##wt -grupo -kei -kraft -susanna -##blood -comprehension -genealogy -##aghan -##verted -redding -##ythe -1722 -bowing -##pore -##roi -lest -sharpened -fulbright -valkyrie -sikhs -##unds -swans -bouquet -merritt -##tage -##venting -commuted -redhead -clerks -leasing -cesare -dea -hazy -##vances -fledged -greenfield -servicemen -##gical -armando -blackout -dt -sagged -downloadable -intra -potion -pods -##4th -##mism -xp -attendants -gambia -stale -##ntine -plump -asteroids 
-rediscovered -buds -flea -hive -##neas -1737 -classifications -debuts -##eles -olympus -scala -##eurs -##gno -##mute -hummed -sigismund -visuals -wiggled -await -pilasters -clench -sulfate -##ances -bellevue -enigma -trainee -snort -##sw -clouded -denim -##rank -##rder -churning -hartman -lodges -riches -sima -##missible -accountable -socrates -regulates -mueller -##cr -1702 -avoids -solids -himalayas -nutrient -pup -##jevic -squat -fades -nec -##lates -##pina -##rona -##ου -privateer -tequila -##gative -##mpton -apt -hornet -immortals -##dou -asturias -cleansing -dario -##rries -##anta -etymology -servicing -zhejiang -##venor -##nx -horned -erasmus -rayon -relocating -£10 -##bags -escalated -promenade -stubble -2010s -artisans -axial -liquids -mora -sho -yoo -##tsky -bundles -oldies -##nally -notification -bastion -##ths -sparkle -##lved -1728 -leash -pathogen -highs -##hmi -immature -880 -gonzaga -ignatius -mansions -monterrey -sweets -bryson -##loe -polled -regatta -brightest -pei -rosy -squid -hatfield -payroll -addict -meath -cornerback -heaviest -lodging -##mage -capcom -rippled -##sily -barnet -mayhem -ymca -snuggled -rousseau -##cute -blanchard -284 -fragmented -leighton -chromosomes -risking -##md -##strel -##utter -corinne -coyotes -cynical -hiroshi -yeomanry -##ractive -ebook -grading -mandela -plume -agustin -magdalene -##rkin -bea -femme -trafford -##coll -##lun -##tance -52nd -fourier -upton -##mental -camilla -gust -iihf -islamabad -longevity -##kala -feldman -netting -##rization -endeavour -foraging -mfa -orr -##open -greyish -contradiction -graz -##ruff -handicapped -marlene -tweed -oaxaca -spp -campos -miocene -pri -configured -cooks -pluto -cozy -pornographic -##entes -70th -fairness -glided -jonny -lynne -rounding -sired -##emon -##nist -remade -uncover -##mack -complied -lei -newsweek -##jured -##parts -##enting -##pg -293 -finer -guerrillas -athenian -deng -disused -stepmother -accuse -gingerly -seduction -521 -confronting -##walker -##going -gora -nostalgia -sabres -virginity -wrenched -##minated -syndication -wielding -eyre -##56 -##gnon -##igny -behaved -taxpayer -sweeps -##growth -childless -gallant -##ywood -amplified -geraldine -scrape -##ffi -babylonian -fresco -##rdan -##kney -##position -1718 -restricting -tack -fukuoka -osborn -selector -partnering -##dlow -318 -gnu -kia -tak -whitley -gables -##54 -##mania -mri -softness -immersion -##bots -##evsky -1713 -chilling -insignificant -pcs -##uis -elites -lina -purported -supplemental -teaming -##americana -##dding -##inton -proficient -rouen -##nage -##rret -niccolo -selects -##bread -fluffy -1621 -gruff -knotted -mukherjee -polgara -thrash -nicholls -secluded -smoothing -thru -corsica -loaf -whitaker -inquiries -##rrier -##kam -indochina -289 -marlins -myles -peking -##tea -extracts -pastry -superhuman -connacht -vogel -##ditional -##het -##udged -##lash -gloss -quarries -refit -teaser -##alic -##gaon -20s -materialized -sling -camped -pickering -tung -tracker -pursuant -##cide -cranes -soc -##cini -##typical -##viere -anhalt -overboard -workout -chores -fares -orphaned -stains -##logie -fenton -surpassing -joyah -triggers -##itte -grandmaster -##lass -##lists -clapping -fraudulent -ledger -nagasaki -##cor -##nosis -##tsa -eucalyptus -tun -##icio -##rney -##tara -dax -heroism -ina -wrexham -onboard -unsigned -##dates -moshe -galley -winnie -droplets -exiles -praises -watered -noodles -##aia -fein -adi -leland -multicultural -stink -bingo -comets -erskine -modernized -canned -constraint -domestically 
-chemotherapy -featherweight -stifled -##mum -darkly -irresistible -refreshing -hasty -isolate -##oys -kitchener -planners -##wehr -cages -yarn -implant -toulon -elects -childbirth -yue -##lind -##lone -cn -rightful -sportsman -junctions -remodeled -specifies -##rgh -291 -##oons -complimented -##urgent -lister -ot -##logic -bequeathed -cheekbones -fontana -gabby -##dial -amadeus -corrugated -maverick -resented -triangles -##hered -##usly -nazareth -tyrol -1675 -assent -poorer -sectional -aegean -##cous -296 -nylon -ghanaian -##egorical -##weig -cushions -forbid -fusiliers -obstruction -somerville -##scia -dime -earrings -elliptical -leyte -oder -polymers -timmy -atm -midtown -piloted -settles -continual -externally -mayfield -##uh -enrichment -henson -keane -persians -1733 -benji -braden -pep -324 -##efe -contenders -pepsi -valet -##isches -298 -##asse -##earing -goofy -stroll -##amen -authoritarian -occurrences -adversary -ahmedabad -tangent -toppled -dorchester -1672 -modernism -marxism -islamist -charlemagne -exponential -racks -unicode -brunette -mbc -pic -skirmish -##bund -##lad -##powered -##yst -hoisted -messina -shatter -##ctum -jedi -vantage -##music -##neil -clemens -mahmoud -corrupted -authentication -lowry -nils -##washed -omnibus -wounding -jillian -##itors -##opped -serialized -narcotics -handheld -##arm -##plicity -intersecting -stimulating -##onis -crate -fellowships -hemingway -casinos -climatic -fordham -copeland -drip -beatty -leaflets -robber -brothel -madeira -##hedral -sphinx -ultrasound -##vana -valor -forbade -leonid -villas -##aldo -duane -marquez -##cytes -disadvantaged -forearms -kawasaki -reacts -consular -lax -uncles -uphold -##hopper -concepcion -dorsey -lass -##izan -arching -passageway -1708 -researches -tia -internationals -##graphs -##opers -distinguishes -javanese -divert -##uven -plotted -##listic -##rwin -##erik -##tify -affirmative -signifies -validation -##bson -kari -felicity -georgina -zulu -##eros -##rained -##rath -overcoming -##dot -argyll -##rbin -1734 -chiba -ratification -windy -earls -parapet -##marks -hunan -pristine -astrid -punta -##gart -brodie -##kota -##oder -malaga -minerva -rouse -##phonic -bellowed -pagoda -portals -reclamation -##gur -##odies -##⁄₄ -parentheses -quoting -allergic -palette -showcases -benefactor -heartland -nonlinear -##tness -bladed -cheerfully -scans -##ety -##hone -1666 -girlfriends -pedersen -hiram -sous -##liche -##nator -1683 -##nery -##orio -##umen -bobo -primaries -smiley -##cb -unearthed -uniformly -fis -metadata -1635 -ind -##oted -recoil -##titles -##tura -##ια -406 -hilbert -jamestown -mcmillan -tulane -seychelles -##frid -antics -coli -fated -stucco -##grants -1654 -bulky -accolades -arrays -caledonian -carnage -optimism -puebla -##tative -##cave -enforcing -rotherham -seo -dunlop -aeronautics -chimed -incline -zoning -archduke -hellenistic -##oses -##sions -candi -thong -##ople -magnate -rustic -##rsk -projective -slant -##offs -danes -hollis -vocalists -##ammed -congenital -contend -gesellschaft -##ocating -##pressive -douglass -quieter -##cm -##kshi -howled -salim -spontaneously -townsville -buena -southport -##bold -kato -1638 -faerie -stiffly -##vus -##rled -297 -flawless -realising -taboo -##7th -bytes -straightening -356 -jena -##hid -##rmin -cartwright -berber -bertram -soloists -411 -noses -417 -coping -fission -hardin -inca -##cen -1717 -mobilized -vhf -##raf -biscuits -curate -##85 -##anial -331 -gaunt -neighbourhoods -1540 -##abas -blanca -bypassed -sockets -behold -coincidentally -##bane 
-nara -shave -splinter -terrific -##arion -##erian -commonplace -juris -redwood -waistband -boxed -caitlin -fingerprints -jennie -naturalized -##ired -balfour -craters -jody -bungalow -hugely -quilt -glitter -pigeons -undertaker -bulging -constrained -goo -##sil -##akh -assimilation -reworked -##person -persuasion -##pants -felicia -##cliff -##ulent -1732 -explodes -##dun -##inium -##zic -lyman -vulture -hog -overlook -begs -northwards -ow -spoil -##urer -fatima -favorably -accumulate -sargent -sorority -corresponded -dispersal -kochi -toned -##imi -##lita -internacional -newfound -##agger -##lynn -##rigue -booths -peanuts -##eborg -medicare -muriel -nur -##uram -crates -millennia -pajamas -worsened -##breakers -jimi -vanuatu -yawned -##udeau -carousel -##hony -hurdle -##ccus -##mounted -##pod -rv -##eche -airship -ambiguity -compulsion -recapture -##claiming -arthritis -##osomal -1667 -asserting -ngc -sniffing -dade -discontent -glendale -ported -##amina -defamation -rammed -##scent -fling -livingstone -##fleet -875 -##ppy -apocalyptic -comrade -lcd -##lowe -cessna -eine -persecuted -subsistence -demi -hoop -reliefs -710 -coptic -progressing -stemmed -perpetrators -1665 -priestess -##nio -dobson -ebony -rooster -itf -tortricidae -##bbon -##jian -cleanup -##jean -##øy -1721 -eighties -taxonomic -holiness -##hearted -##spar -antilles -showcasing -stabilized -##nb -gia -mascara -michelangelo -dawned -##uria -##vinsky -extinguished -fitz -grotesque -£100 -##fera -##loid -##mous -barges -neue -throbbed -cipher -johnnie -##a1 -##mpt -outburst -##swick -spearheaded -administrations -c1 -heartbreak -pixels -pleasantly -##enay -lombardy -plush -##nsed -bobbie -##hly -reapers -tremor -xiang -minogue -substantive -hitch -barak -##wyl -kwan -##encia -910 -obscene -elegance -indus -surfer -bribery -conserve -##hyllum -##masters -horatio -##fat -apes -rebound -psychotic -##pour -iteration -##mium -##vani -botanic -horribly -antiques -dispose -paxton -##hli -##wg -timeless -1704 -disregard -engraver -hounds -##bau -##version -looted -uno -facilitates -groans -masjid -rutland -antibody -disqualification -decatur -footballers -quake -slacks -48th -rein -scribe -stabilize -commits -exemplary -tho -##hort -##chison -pantry -traversed -##hiti -disrepair -identifiable -vibrated -baccalaureate -##nnis -csa -interviewing -##iensis -##raße -greaves -wealthiest -343 -classed -jogged -£5 -##58 -##atal -illuminating -knicks -respecting -##uno -scrubbed -##iji -##dles -kruger -moods -growls -raider -silvia -chefs -kam -vr -cree -percival -##terol -gunter -counterattack -defiant -henan -ze -##rasia -##riety -equivalence -submissions -##fra -##thor -bautista -mechanically -##heater -cornice -herbal -templar -##mering -outputs -ruining -ligand -renumbered -extravagant -mika -blockbuster -eta -insurrection -##ilia -darkening -ferocious -pianos -strife -kinship -##aer -melee -##anor -##iste -##may -##oue -decidedly -weep -##jad -##missive -##ppel -354 -puget -unease -##gnant -1629 -hammering -kassel -ob -wessex -##lga -bromwich -egan -paranoia -utilization -##atable -##idad -contradictory -provoke -##ols -##ouring -##tangled -knesset -##very -##lette -plumbing -##sden -##¹ -greensboro -occult -sniff -338 -zev -beaming -gamer -haggard -mahal -##olt -##pins -mendes -utmost -briefing -gunnery -##gut -##pher -##zh -##rok -1679 -khalifa -sonya -##boot -principals -urbana -wiring -##liffe -##minating -##rrado -dahl -nyu -skepticism -np -townspeople -ithaca -lobster -somethin -##fur -##arina -##−1 -freighter -zimmerman -biceps 
-contractual -##herton -amend -hurrying -subconscious -##anal -336 -meng -clermont -spawning -##eia -##lub -dignitaries -impetus -snacks -spotting -twigs -##bilis -##cz -##ouk -libertadores -nic -skylar -##aina -##firm -gustave -asean -##anum -dieter -legislatures -flirt -bromley -trolls -umar -##bbies -##tyle -blah -parc -bridgeport -crank -negligence -##nction -46th -constantin -molded -bandages -seriousness -00pm -siegel -carpets -compartments -upbeat -statehood -##dner -##edging -marko -730 -platt -##hane -paving -##iy -1738 -abbess -impatience -limousine -nbl -##talk -441 -lucille -mojo -nightfall -robbers -##nais -karel -brisk -calves -replicate -ascribed -telescopes -##olf -intimidated -##reen -ballast -specialization -##sit -aerodynamic -caliphate -rainer -visionary -##arded -epsilon -##aday -##onte -aggregation -auditory -boosted -reunification -kathmandu -loco -robyn -402 -acknowledges -appointing -humanoid -newell -redeveloped -restraints -##tained -barbarians -chopper -1609 -italiana -##lez -##lho -investigates -wrestlemania -##anies -##bib -690 -##falls -creaked -dragoons -gravely -minions -stupidity -volley -##harat -##week -musik -##eries -##uously -fungal -massimo -semantics -malvern -##ahl -##pee -discourage -embryo -imperialism -1910s -profoundly -##ddled -jiangsu -sparkled -stat -##holz -sweatshirt -tobin -##iction -sneered -##cheon -##oit -brit -causal -smyth -##neuve -diffuse -perrin -silvio -##ipes -##recht -detonated -iqbal -selma -##nism -##zumi -roasted -##riders -tay -##ados -##mament -##mut -##rud -840 -completes -nipples -cfa -flavour -hirsch -##laus -calderon -sneakers -moravian -##ksha -1622 -rq -294 -##imeters -bodo -##isance -##pre -##ronia -anatomical -excerpt -##lke -dh -kunst -##tablished -##scoe -biomass -panted -unharmed -gael -housemates -montpellier -##59 -coa -rodents -tonic -hickory -singleton -##taro -451 -1719 -aldo -breaststroke -dempsey -och -rocco -##cuit -merton -dissemination -midsummer -serials -##idi -haji -polynomials -##rdon -gs -enoch -prematurely -shutter -taunton -£3 -##grating -##inates -archangel -harassed -##asco -326 -archway -dazzling -##ecin -1736 -sumo -wat -##kovich -1086 -honneur -##ently -##nostic -##ttal -##idon -1605 -403 -1716 -blogger -rents -##gnan -hires -##ikh -##dant -howie -##rons -handler -retracted -shocks -1632 -arun -duluth -kepler -trumpeter -##lary -peeking -seasoned -trooper -##mara -laszlo -##iciencies -##rti -heterosexual -##inatory -##ssion -indira -jogging -##inga -##lism -beit -dissatisfaction -malice -##ately -nedra -peeling -##rgeon -47th -stadiums -475 -vertigo -##ains -iced -restroom -##plify -##tub -illustrating -pear -##chner -##sibility -inorganic -rappers -receipts -watery -##kura -lucinda -##oulos -reintroduced -##8th -##tched -gracefully -saxons -nutritional -wastewater -rained -favourites -bedrock -fisted -hallways -likeness -upscale -##lateral -1580 -blinds -prequel -##pps -##tama -deter -humiliating -restraining -tn -vents -1659 -laundering -recess -rosary -tractors -coulter -federer -##ifiers -##plin -persistence -##quitable -geschichte -pendulum -quakers -##beam -bassett -pictorial -buffet -koln -##sitor -drills -reciprocal -shooters -##57 -##cton -##tees -converge -pip -dmitri -donnelly -yamamoto -aqua -azores -demographics -hypnotic -spitfire -suspend -wryly -roderick -##rran -sebastien -##asurable -mavericks -##fles -##200 -himalayan -prodigy -##iance -transvaal -demonstrators -handcuffs -dodged -mcnamara -sublime -1726 -crazed -##efined -##till -ivo -pondered -reconciled -shrill -sava 
-##duk -bal -cad -heresy -jaipur -goran -##nished -341 -lux -shelly -whitehall -##hre -israelis -peacekeeping -##wled -1703 -demetrius -ousted -##arians -##zos -beale -anwar -backstroke -raged -shrinking -cremated -##yck -benign -towing -wadi -darmstadt -landfill -parana -soothe -colleen -sidewalks -mayfair -tumble -hepatitis -ferrer -superstructure -##gingly -##urse -##wee -anthropological -translators -##mies -closeness -hooves -##pw -mondays -##roll -##vita -landscaping -##urized -purification -sock -thorns -thwarted -jalan -tiberius -##taka -saline -##rito -confidently -khyber -sculptors -##ij -brahms -hammersmith -inspectors -battista -fivb -fragmentation -hackney -##uls -arresting -exercising -antoinette -bedfordshire -##zily -dyed -##hema -1656 -racetrack -variability -##tique -1655 -austrians -deteriorating -madman -theorists -aix -lehman -weathered -1731 -decreed -eruptions -1729 -flaw -quinlan -sorbonne -flutes -nunez -1711 -adored -downwards -fable -rasped -1712 -moritz -mouthful -renegade -shivers -stunts -dysfunction -restrain -translit -327 -pancakes -##avio -##cision -##tray -351 -vial -##lden -bain -##maid -##oxide -chihuahua -malacca -vimes -##rba -##rnier -1664 -donnie -plaques -##ually -337 -bangs -floppy -huntsville -loretta -nikolay -##otte -eater -handgun -ubiquitous -##hett -eras -zodiac -1634 -##omorphic -1820s -##zog -cochran -##bula -##lithic -warring -##rada -dalai -excused -blazers -mcconnell -reeling -bot -este -##abi -geese -hoax -taxon -##bla -guitarists -##icon -condemning -hunts -inversion -moffat -taekwondo -##lvis -1624 -stammered -##rest -##rzy -sousa -fundraiser -marylebone -navigable -uptown -cabbage -daniela -salman -shitty -whimper -##kian -##utive -programmers -protections -rm -##rmi -##rued -forceful -##enes -fuss -##tao -##wash -brat -oppressive -reykjavik -spartak -ticking -##inkles -##kiewicz -adolph -horst -maui -protege -straighten -cpc -landau -concourse -clements -resultant -##ando -imaginative -joo -reactivated -##rem -##ffled -##uising -consultative -##guide -flop -kaitlyn -mergers -parenting -somber -##vron -supervise -vidhan -##imum -courtship -exemplified -harmonies -medallist -refining -##rrow -##ка -amara -##hum -780 -goalscorer -sited -overshadowed -rohan -displeasure -secretive -multiplied -osman -##orth -engravings -padre -##kali -##veda -miniatures -mis -##yala -clap -pali -rook -##cana -1692 -57th -antennae -astro -oskar -1628 -bulldog -crotch -hackett -yucatan -##sure -amplifiers -brno -ferrara -migrating -##gree -thanking -turing -##eza -mccann -ting -andersson -onslaught -gaines -ganga -incense -standardization -##mation -sentai -scuba -stuffing -turquoise -waivers -alloys -##vitt -regaining -vaults -##clops -##gizing -digger -furry -memorabilia -probing -##iad -payton -rec -deutschland -filippo -opaque -seamen -zenith -afrikaans -##filtration -disciplined -inspirational -##merie -banco -confuse -grafton -tod -##dgets -championed -simi -anomaly -biplane -##ceptive -electrode -##para -1697 -cleavage -crossbow -swirl -informant -##lars -##osta -afi -bonfire -spec -##oux -lakeside -slump -##culus -##lais -##qvist -##rrigan -1016 -facades -borg -inwardly -cervical -xl -pointedly -050 -stabilization -##odon -chests -1699 -hacked -ctv -orthogonal -suzy -##lastic -gaulle -jacobite -rearview -##cam -##erted -ashby -##drik -##igate -##mise -##zbek -affectionately -canine -disperse -latham -##istles -##ivar -spielberg -##orin -##idium -ezekiel -cid -##sg -durga -middletown -##cina -customized -frontiers -harden -##etano -##zzy -1604 
-bolsheviks -##66 -coloration -yoko -##bedo -briefs -slabs -debra -liquidation -plumage -##oin -blossoms -dementia -subsidy -1611 -proctor -relational -jerseys -parochial -ter -##ici -esa -peshawar -cavalier -loren -cpi -idiots -shamrock -1646 -dutton -malabar -mustache -##endez -##ocytes -referencing -terminates -marche -yarmouth -##sop -acton -mated -seton -subtly -baptised -beige -extremes -jolted -kristina -telecast -##actic -safeguard -waldo -##baldi -##bular -endeavors -sloppy -subterranean -##ensburg -##itung -delicately -pigment -tq -##scu -1626 -##ound -collisions -coveted -herds -##personal -##meister -##nberger -chopra -##ricting -abnormalities -defective -galician -lucie -##dilly -alligator -likened -##genase -burundi -clears -complexion -derelict -deafening -diablo -fingered -champaign -dogg -enlist -isotope -labeling -mrna -##erre -brilliance -marvelous -##ayo -1652 -crawley -ether -footed -dwellers -deserts -hamish -rubs -warlock -skimmed -##lizer -870 -buick -embark -heraldic -irregularities -##ajan -kiara -##kulam -##ieg -antigen -kowalski -##lge -oakley -visitation -##mbit -vt -##suit -1570 -murderers -##miento -##rites -chimneys -##sling -condemn -custer -exchequer -havre -##ghi -fluctuations -##rations -dfb -hendricks -vaccines -##tarian -nietzsche -biking -juicy -##duced -brooding -scrolling -selangor -##ragan -352 -annum -boomed -seminole -sugarcane -##dna -departmental -dismissing -innsbruck -arteries -ashok -batavia -daze -kun -overtook -##rga -##tlan -beheaded -gaddafi -holm -electronically -faulty -galilee -fractures -kobayashi -##lized -gunmen -magma -aramaic -mala -eastenders -inference -messengers -bf -##qu -407 -bathrooms -##vere -1658 -flashbacks -ideally -misunderstood -##jali -##weather -mendez -##grounds -505 -uncanny -##iii -1709 -friendships -##nbc -sacrament -accommodated -reiterated -logistical -pebbles -thumped -##escence -administering -decrees -drafts -##flight -##cased -##tula -futuristic -picket -intimidation -winthrop -##fahan -interfered -339 -afar -francoise -morally -uta -cochin -croft -dwarfs -##bruck -##dents -##nami -biker -##hner -##meral -nano -##isen -##ometric -##pres -##ан -brightened -meek -parcels -securely -gunners -##jhl -##zko -agile -hysteria -##lten -##rcus -bukit -champs -chevy -cuckoo -leith -sadler -theologians -welded -##section -1663 -jj -plurality -xander -##rooms -##formed -shredded -temps -intimately -pau -tormented -##lok -##stellar -1618 -charred -ems -essen -##mmel -alarms -spraying -ascot -blooms -twinkle -##abia -##apes -internment -obsidian -##chaft -snoop -##dav -##ooping -malibu -##tension -quiver -##itia -hays -mcintosh -travers -walsall -##ffie -1623 -beverley -schwarz -plunging -structurally -m3 -rosenthal -vikram -##tsk -770 -ghz -##onda -##tiv -chalmers -groningen -pew -reckon -unicef -##rvis -55th -##gni -1651 -sulawesi -avila -cai -metaphysical -screwing -turbulence -##mberg -augusto -samba -56th -baffled -momentary -toxin -##urian -##wani -aachen -condoms -dali -steppe -##3d -##app -##oed -##year -adolescence -dauphin -electrically -inaccessible -microscopy -nikita -##ega -atv -##cel -##enter -##oles -##oteric -##ы -accountants -punishments -wrongly -bribes -adventurous -clinch -flinders -southland -##hem -##kata -gough -##ciency -lads -soared -##ה -undergoes -deformation -outlawed -rubbish -##arus -##mussen -##nidae -##rzburg -arcs -##ingdon -##tituted -1695 -wheelbase -wheeling -bombardier -campground -zebra -##lices -##oj -##bain -lullaby -##ecure -donetsk -wylie -grenada -##arding -##ης -squinting 
-eireann -opposes -##andra -maximal -runes -##broken -##cuting -##iface -##ror -##rosis -additive -britney -adultery -triggering -##drome -detrimental -aarhus -containment -jc -swapped -vichy -##ioms -madly -##oric -##rag -brant -##ckey -##trix -1560 -1612 -broughton -rustling -##stems -##uder -asbestos -mentoring -##nivorous -finley -leaps -##isan -apical -pry -slits -substitutes -##dict -intuitive -fantasia -insistent -unreasonable -##igen -##vna -domed -hannover -margot -ponder -##zziness -impromptu -jian -lc -rampage -stemming -##eft -andrey -gerais -whichever -amnesia -appropriated -anzac -clicks -modifying -ultimatum -cambrian -maids -verve -yellowstone -##mbs -conservatoire -##scribe -adherence -dinners -spectra -imperfect -mysteriously -sidekick -tatar -tuba -##aks -##ifolia -distrust -##athan -##zle -c2 -ronin -zac -##pse -celaena -instrumentalist -scents -skopje -##mbling -comical -compensated -vidal -condor -intersect -jingle -wavelengths -##urrent -mcqueen -##izzly -carp -weasel -422 -kanye -militias -postdoctoral -eugen -gunslinger -##ɛ -faux -hospice -##for -appalled -derivation -dwarves -##elis -dilapidated -##folk -astoria -philology -##lwyn -##otho -##saka -inducing -philanthropy -##bf -##itative -geek -markedly -sql -##yce -bessie -indices -rn -##flict -495 -frowns -resolving -weightlifting -tugs -cleric -contentious -1653 -mania -rms -##miya -##reate -##ruck -##tucket -bien -eels -marek -##ayton -##cence -discreet -unofficially -##ife -leaks -##bber -1705 -332 -dung -compressor -hillsborough -pandit -shillings -distal -##skin -381 -##tat -##you -nosed -##nir -mangrove -undeveloped -##idia -textures -##inho -##500 -##rise -ae -irritating -nay -amazingly -bancroft -apologetic -compassionate -kata -symphonies -##lovic -airspace -##lch -930 -gifford -precautions -fulfillment -sevilla -vulgar -martinique -##urities -looting -piccolo -tidy -##dermott -quadrant -armchair -incomes -mathematicians -stampede -nilsson -##inking -##scan -foo -quarterfinal -##ostal -shang -shouldered -squirrels -##owe -344 -vinegar -##bner -##rchy -##systems -delaying -##trics -ars -dwyer -rhapsody -sponsoring -##gration -bipolar -cinder -starters -##olio -##urst -421 -signage -##nty -aground -figurative -mons -acquaintances -duets -erroneously -soyuz -elliptic -recreated -##cultural -##quette -##ssed -##tma -##zcz -moderator -scares -##itaire -##stones -##udence -juniper -sighting -##just -##nsen -britten -calabria -ry -bop -cramer -forsyth -stillness -##л -airmen -gathers -unfit -##umber -##upt -taunting -##rip -seeker -streamlined -##bution -holster -schumann -tread -vox -##gano -##onzo -strive -dil -reforming -covent -newbury -predicting -##orro -decorate -tre -##puted -andover -ie -asahi -dept -dunkirk -gills -##tori -buren -huskies -##stis -##stov -abstracts -bets -loosen -##opa -1682 -yearning -##glio -##sir -berman -effortlessly -enamel -napoli -persist -##peration -##uez -attache -elisa -b1 -invitations -##kic -accelerating -reindeer -boardwalk -clutches -nelly -polka -starbucks -##kei -adamant -huey -lough -unbroken -adventurer -embroidery -inspecting -stanza -##ducted -naia -taluka -##pone -##roids -chases -deprivation -florian -##jing -##ppet -earthly -##lib -##ssee -colossal -foreigner -vet -freaks -patrice -rosewood -triassic -upstate -##pkins -dominates -ata -chants -ks -vo -##400 -##bley -##raya -##rmed -555 -agra -infiltrate -##ailing -##ilation -##tzer -##uppe -##werk -binoculars -enthusiast -fujian -squeak -##avs -abolitionist -almeida -boredom -hampstead -marsden -rations -##ands 
-inflated -334 -bonuses -rosalie -patna -##rco -329 -detachments -penitentiary -54th -flourishing -woolf -##dion -##etched -papyrus -##lster -##nsor -##toy -bobbed -dismounted -endelle -inhuman -motorola -tbs -wince -wreath -##ticus -hideout -inspections -sanjay -disgrace -infused -pudding -stalks -##urbed -arsenic -leases -##hyl -##rrard -collarbone -##waite -##wil -dowry -##bant -##edance -genealogical -nitrate -salamanca -scandals -thyroid -necessitated -##! -##" -### -##$ -##% -##& -##' -##( -##) -##* -##+ -##, -##- -##. -##/ -##: -##; -##< -##= -##> -##? -##@ -##[ -##\ -##] -##^ -##_ -##` -##{ -##| -##} -##~ -##¡ -##¢ -##£ -##¤ -##¥ -##¦ -##§ -##¨ -##© -##ª -##« -##¬ -##® -##± -##´ -##µ -##¶ -##· -##º -##» -##¼ -##¾ -##¿ -##æ -##ð -##÷ -##þ -##đ -##ħ -##ŋ -##œ -##ƒ -##ɐ -##ɑ -##ɒ -##ɔ -##ɕ -##ə -##ɡ -##ɣ -##ɨ -##ɪ -##ɫ -##ɬ -##ɯ -##ɲ -##ɴ -##ɹ -##ɾ -##ʀ -##ʁ -##ʂ -##ʃ -##ʉ -##ʊ -##ʋ -##ʌ -##ʎ -##ʐ -##ʑ -##ʒ -##ʔ -##ʰ -##ʲ -##ʳ -##ʷ -##ʸ -##ʻ -##ʼ -##ʾ -##ʿ -##ˈ -##ˡ -##ˢ -##ˣ -##ˤ -##β -##γ -##δ -##ε -##ζ -##θ -##κ -##λ -##μ -##ξ -##ο -##π -##ρ -##σ -##τ -##υ -##φ -##χ -##ψ -##ω -##б -##г -##д -##ж -##з -##м -##п -##с -##у -##ф -##х -##ц -##ч -##ш -##щ -##ъ -##э -##ю -##ђ -##є -##і -##ј -##љ -##њ -##ћ -##ӏ -##ա -##բ -##գ -##դ -##ե -##թ -##ի -##լ -##կ -##հ -##մ -##յ -##ն -##ո -##պ -##ս -##վ -##տ -##ր -##ւ -##ք -##־ -##א -##ב -##ג -##ד -##ו -##ז -##ח -##ט -##י -##ך -##כ -##ל -##ם -##מ -##ן -##נ -##ס -##ע -##ף -##פ -##ץ -##צ -##ק -##ר -##ש -##ת -##، -##ء -##ب -##ت -##ث -##ج -##ح -##خ -##ذ -##ز -##س -##ش -##ص -##ض -##ط -##ظ -##ع -##غ -##ـ -##ف -##ق -##ك -##و -##ى -##ٹ -##پ -##چ -##ک -##گ -##ں -##ھ -##ہ -##ے -##अ -##आ -##उ -##ए -##क -##ख -##ग -##च -##ज -##ट -##ड -##ण -##त -##थ -##द -##ध -##न -##प -##ब -##भ -##म -##य -##र -##ल -##व -##श -##ष -##स -##ह -##ा -##ि -##ी -##ो -##। -##॥ -##ং -##অ -##আ -##ই -##উ -##এ -##ও -##ক -##খ -##গ -##চ -##ছ -##জ -##ট -##ড -##ণ -##ত -##থ -##দ -##ধ -##ন -##প -##ব -##ভ -##ম -##য -##র -##ল -##শ -##ষ -##স -##হ -##া -##ি -##ী -##ে -##க -##ச -##ட -##த -##ந -##ன -##ப -##ம -##ய -##ர -##ல -##ள -##வ -##ா -##ி -##ு -##ே -##ை -##ನ -##ರ -##ಾ -##ක -##ය -##ර -##ල -##ව -##ා -##ก -##ง -##ต -##ท -##น -##พ -##ม -##ย -##ร -##ล -##ว -##ส -##อ -##า -##เ -##་ -##། -##ག -##ང -##ད -##ན -##པ -##བ -##མ -##འ -##ར -##ལ -##ས -##မ -##ა -##ბ -##გ -##დ -##ე -##ვ -##თ -##ი -##კ -##ლ -##მ -##ნ -##ო -##რ -##ს -##ტ -##უ -##ᄀ -##ᄂ -##ᄃ -##ᄅ -##ᄆ -##ᄇ -##ᄉ -##ᄊ -##ᄋ -##ᄌ -##ᄎ -##ᄏ -##ᄐ -##ᄑ -##ᄒ -##ᅡ -##ᅢ -##ᅥ -##ᅦ -##ᅧ -##ᅩ -##ᅪ -##ᅭ -##ᅮ -##ᅯ -##ᅲ -##ᅳ -##ᅴ -##ᅵ -##ᆨ -##ᆫ -##ᆯ -##ᆷ -##ᆸ -##ᆼ -##ᴬ -##ᴮ -##ᴰ -##ᴵ -##ᴺ -##ᵀ -##ᵃ -##ᵇ -##ᵈ -##ᵉ -##ᵍ -##ᵏ -##ᵐ -##ᵒ -##ᵖ -##ᵗ -##ᵘ -##ᵣ -##ᵤ -##ᵥ -##ᶜ -##ᶠ -##‐ -##‑ -##‒ -##– -##— -##― -##‖ -##‘ -##’ -##‚ -##“ -##” -##„ -##† -##‡ -##• -##… -##‰ -##′ -##″ -##› -##‿ -##⁄ -##⁰ -##ⁱ -##⁴ -##⁵ -##⁶ -##⁷ -##⁸ -##⁹ -##⁻ -##ⁿ -##₅ -##₆ -##₇ -##₈ -##₉ -##₊ -##₍ -##₎ -##ₐ -##ₑ -##ₒ -##ₓ -##ₕ -##ₖ -##ₗ -##ₘ -##ₚ -##ₛ -##ₜ -##₤ -##₩ -##€ -##₱ -##₹ -##ℓ -##№ -##ℝ -##™ -##⅓ -##⅔ -##← -##↑ -##→ -##↓ -##↔ -##↦ -##⇄ -##⇌ -##⇒ -##∂ -##∅ -##∆ -##∇ -##∈ -##∗ -##∘ -##√ -##∞ -##∧ -##∨ -##∩ -##∪ -##≈ -##≡ -##≤ -##≥ -##⊂ -##⊆ -##⊕ -##⊗ -##⋅ -##─ -##│ -##■ -##▪ -##● -##★ -##☆ -##☉ -##♠ -##♣ -##♥ -##♦ -##♯ -##⟨ -##⟩ -##ⱼ -##⺩ -##⺼ -##⽥ -##、 -##。 -##〈 -##〉 -##《 -##》 -##「 -##」 -##『 -##』 -##〜 -##あ -##い -##う -##え -##お -##か -##き -##く -##け -##こ -##さ -##し -##す -##せ -##そ -##た -##ち -##っ -##つ -##て -##と -##な -##に -##ぬ -##ね -##の -##は -##ひ -##ふ -##へ -##ほ -##ま -##み -##む -##め -##も -##や -##ゆ -##よ -##ら -##り -##る -##れ -##ろ -##を -##ん -##ァ -##ア -##ィ -##イ -##ウ -##ェ -##エ -##オ -##カ -##キ -##ク -##ケ 
-##コ -##サ -##シ -##ス -##セ -##タ -##チ -##ッ -##ツ -##テ -##ト -##ナ -##ニ -##ノ -##ハ -##ヒ -##フ -##ヘ -##ホ -##マ -##ミ -##ム -##メ -##モ -##ャ -##ュ -##ョ -##ラ -##リ -##ル -##レ -##ロ -##ワ -##ン -##・ -##ー -##一 -##三 -##上 -##下 -##不 -##世 -##中 -##主 -##久 -##之 -##也 -##事 -##二 -##五 -##井 -##京 -##人 -##亻 -##仁 -##介 -##代 -##仮 -##伊 -##会 -##佐 -##侍 -##保 -##信 -##健 -##元 -##光 -##八 -##公 -##内 -##出 -##分 -##前 -##劉 -##力 -##加 -##勝 -##北 -##区 -##十 -##千 -##南 -##博 -##原 -##口 -##古 -##史 -##司 -##合 -##吉 -##同 -##名 -##和 -##囗 -##四 -##国 -##國 -##土 -##地 -##坂 -##城 -##堂 -##場 -##士 -##夏 -##外 -##大 -##天 -##太 -##夫 -##奈 -##女 -##子 -##学 -##宀 -##宇 -##安 -##宗 -##定 -##宣 -##宮 -##家 -##宿 -##寺 -##將 -##小 -##尚 -##山 -##岡 -##島 -##崎 -##川 -##州 -##巿 -##帝 -##平 -##年 -##幸 -##广 -##弘 -##張 -##彳 -##後 -##御 -##德 -##心 -##忄 -##志 -##忠 -##愛 -##成 -##我 -##戦 -##戸 -##手 -##扌 -##政 -##文 -##新 -##方 -##日 -##明 -##星 -##春 -##昭 -##智 -##曲 -##書 -##月 -##有 -##朝 -##木 -##本 -##李 -##村 -##東 -##松 -##林 -##森 -##楊 -##樹 -##橋 -##歌 -##止 -##正 -##武 -##比 -##氏 -##民 -##水 -##氵 -##氷 -##永 -##江 -##沢 -##河 -##治 -##法 -##海 -##清 -##漢 -##瀬 -##火 -##版 -##犬 -##王 -##生 -##田 -##男 -##疒 -##発 -##白 -##的 -##皇 -##目 -##相 -##省 -##真 -##石 -##示 -##社 -##神 -##福 -##禾 -##秀 -##秋 -##空 -##立 -##章 -##竹 -##糹 -##美 -##義 -##耳 -##良 -##艹 -##花 -##英 -##華 -##葉 -##藤 -##行 -##街 -##西 -##見 -##訁 -##語 -##谷 -##貝 -##貴 -##車 -##軍 -##辶 -##道 -##郎 -##郡 -##部 -##都 -##里 -##野 -##金 -##鈴 -##镇 -##長 -##門 -##間 -##阝 -##阿 -##陳 -##陽 -##雄 -##青 -##面 -##風 -##食 -##香 -##馬 -##高 -##龍 -##龸 -##fi -##fl -##! -##( -##) -##, -##- -##. -##/ -##: -##? -##~ diff --git a/AVLFormer/data_prepro/create_image_frame_tsv.py b/AVLFormer/data_prepro/create_image_frame_tsv.py deleted file mode 100644 index e2ee280..0000000 --- a/AVLFormer/data_prepro/create_image_frame_tsv.py +++ /dev/null @@ -1,168 +0,0 @@ -import argparse -import json -import multiprocessing as mp -import os -import os.path as op -from pathlib import Path -import sys - -import numpy as np -from tqdm import tqdm - -pythonpath = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) -print(pythonpath) -sys.path.insert(0, pythonpath) -import base64 - -import cv2 -from src.utils.tsv_file_ops import tsv_writer - - -def resize_and_to_binary(img_path, target_image_size): - if img_path is None: - if target_image_size < 0: - target_image_size = 256 - resized = np.zeros((target_image_size, target_image_size, 3), - dtype="uint8") - s = (target_image_size, target_image_size) - else: - # im = Image.open(img_path) - im = cv2.imread(img_path) - height, width, channels = im.shape - s = (width, height) - if target_image_size > 0: - s = min(width, height) - - r = target_image_size / s - s = (round(r * width), round(r * height)) - # im = im.resize(s) - resized = cv2.resize(im, s) - else: - resized = im - - # binary = io.BytesIO() - # im.save(binary, format='JPEG') - # binary = binary.getvalue() - binary = cv2.imencode('.jpg', resized)[1].tobytes() - encoded_base64 = base64.b64encode(binary) - return encoded_base64, s - - -def load_tsv_to_mem(tsv_file, sep='\t'): - data = [] - with open(tsv_file, 'r') as fp: - for _, line in enumerate(fp): - data.append([x.strip() for x in line.split(sep)]) - return data - - -def get_image_binaries(list_of_paths, image_size=56): - batch = [] - is_None = [v is None for v in list_of_paths] - assert not any(is_None) or all(is_None) - for img_path in list_of_paths: - if img_path is None or isinstance(img_path, str): - x, shape = resize_and_to_binary(img_path, - target_image_size=image_size) - else: - raise ValueError(f'img_path not str, but {type(img_path)}') - batch.append(x) - return batch, shape - - -def 
prepare_single_video_frames(vid_path, num_frames=32):
-    previous_image_path = None
-    images = []
-    local_data_path = vid_path.replace("datasets", "_datasets")
-    if not op.exists(local_data_path) and not op.exists(vid_path):
-        # print(f'{vid_path} does not exist')
-        images = [None] * num_frames
-        return images
-
-    video_id = Path(vid_path).stem
-    for i in range(num_frames):
-        current_image_path = op.join(data_path,
-                                     f'{video_id}_frame{(i+1):04d}.jpg')
-        if not op.exists(current_image_path):
-            print(f'{current_image_path} does not exist')
-            if previous_image_path:
-                current_image_path = previous_image_path
-            else:
-                print(f'The first image for {video_id} does not exist')
-                images = [None] * num_frames
-                return images
-        images.append(current_image_path)
-        previous_image_path = current_image_path
-    return images
-
-
-def process_video_chunk(item, image_size=56, num_frames=32):
-    # line_items = []
-    # for item in items:
-    _, vid_path = item
-    images = prepare_single_video_frames(vid_path, num_frames)
-    image_binaries, image_shape = get_image_binaries(images, image_size)
-
-    resolved_data_vid_id, vid_path = item
-    line_item = [
-        str(resolved_data_vid_id),
-        json.dumps({
-            "class": -1,
-            "width": image_shape[0],
-            "height": image_shape[1]
-        })
-    ]
-    line_item += image_binaries
-    return line_item
-    # line_items.append(line_item)
-    # return line_items
-
-
-def main(args):
-    output_folder = "datasets/frame_tsv"
-    os.makedirs(output_folder, exist_ok=True)
-    # To generate a tsv file:
-    # data_path: path to raw video files
-    global data_path
-    if args.dataset == "MSRVTT":
-        data_path = f"datasets/MSRVTT-v2/{args.num_frames}frames/"
-    else:
-        data_path = f"datasets/{args.num_frames}frames/"
-
-    data = load_tsv_to_mem(f'datasets/metadata/{args.split}.img.tsv')
-
-    from functools import partial
-    worker = partial(process_video_chunk,
-                     image_size=args.image_size,
-                     num_frames=args.num_frames)
-
-    def gen_rows():
-        with mp.Pool(args.num_workers) as pool, tqdm(total=len(data)) as pbar:
-            for _, line_item in enumerate(pool.imap(worker, data,
-                                                    chunksize=8)):
-                pbar.update(1)
-                yield line_item
-
-    resolved_visual_file = f"{output_folder}/{args.split}_{args.num_frames}frames.img.tsv"
-    print("generating visual file for", resolved_visual_file)
-    tsv_writer(gen_rows(), resolved_visual_file)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--dataset",
-                        help="MSRVTT-v2/VATEX",
-                        type=str,
-                        default="MSRVTT-v2")
-    parser.add_argument("--split",
-                        help="train/val/test",
-                        type=str,
-                        default="test")
-    parser.add_argument("--image_size",
-                        help="256/128/56",
-                        type=int,
-                        default=256)
-    parser.add_argument("--num_frames", help="32/128", type=int, default=32)
-    parser.add_argument("--num_workers",
-                        help="number of worker processes",
-                        type=int,
-                        default=16)
-    args = parser.parse_args()
-    main(args)
diff --git a/AVLFormer/data_prepro/extract_frames.py b/AVLFormer/data_prepro/extract_frames.py
deleted file mode 100644
index e3a48c9..0000000
--- a/AVLFormer/data_prepro/extract_frames.py
+++ /dev/null
@@ -1,234 +0,0 @@
-import datetime
-import multiprocessing as mp
-import os
-from os.path import join
-import subprocess
-
-from tqdm import tqdm
-
-
-def get_video_fps(video_file):
-    result = subprocess.run([
-        "ffprobe", "-v", "error", "-select_streams", "v", "-of",
-        "default=noprint_wrappers=1:nokey=1", "-show_entries",
-        "stream=r_frame_rate", video_file
-    ],
-                            stdout=subprocess.PIPE,
-                            stderr=subprocess.STDOUT)
-
-    result_string = result.stdout.decode('utf-8').split()[0].split('/')
-    return float(result_string[0]) / float(result_string[1])
-
-
-def get_video_duration(video_file):
-    result = subprocess.run([
-        "ffprobe", "-v", "error", "-show_entries", "format=duration",
-        "-of", "default=noprint_wrappers=1:nokey=1", video_file
-    ],
-                            stdout=subprocess.PIPE,
-                            stderr=subprocess.STDOUT)
-    return float(result.stdout)
-
-
-def extract_frame_from_video(video_path,
-                             save_frame_path,
-                             fps=1,
-                             num_frames=-1,
-                             start_ts=-1,
-                             end_ts=-1,
-                             suppress_msg=False,
-                             other_args="",
-                             overwrite=True):
-    """Extract jpg frames from a video with ffmpeg.
-
-    Either samples at a fixed frame rate (fps), or, when num_frames > 0,
-    extracts exactly num_frames frames spaced uniformly over the video
-    (optionally restricted to the [start_ts, end_ts] segment).
-
-    Args:
-        video_path: path to the input video.
-        save_frame_path: filename prefix for the output jpg frames.
-        fps: frames per second to sample, default 1.
-        num_frames: if > 0, extract exactly this many frames instead.
-        suppress_msg: silence ffmpeg output.
-        other_args: str, other ffmpeg args, such as re-scale to 720p with '-vf scale=-1:720'
-    """
-    extra_args = " -hide_banner -loglevel panic " if suppress_msg else ""
-    extra_args += " -y " if overwrite else ""
-    if start_ts != -1 and end_ts != -1:
-        start_ts_str = str(datetime.timedelta(seconds=start_ts))
-        end_ts_str = str(datetime.timedelta(seconds=end_ts))
-        duration = str(datetime.timedelta(seconds=(end_ts - start_ts)))
-        # print(start_ts, end_ts, duration)
-        extra_args += f"-ss {start_ts_str} -t {duration} "
-    # extra_args2 = " -vf scale=720:-2 "
-    # -preset veryfast: (upgrade to latest ffmpeg if error)
-    # https://superuser.com/questions/490683/cheat-sheets-and-presets-settings-that-actually-work-with-ffmpeg-1-0
-    if num_frames <= 0:
-        split_cmd_template = "ffmpeg {extra} -i {video_path} -vf fps={fps} {output_frame_path}%06d.jpg"
-
-        cur_split_cmd = split_cmd_template.format(
-            extra=extra_args,
-            video_path=video_path,
-            fps=fps,
-            output_frame_path=save_frame_path)
-    else:
-        fps = get_video_fps(video_path)
-        # get duration of the video
-        if start_ts != -1 and end_ts != -1:
-            duration = end_ts - start_ts
-        else:
-            duration = get_video_duration(video_path)
-        if duration <= 0:
-            duration = 10
-            print('Duration ERROR!', video_path)
-        # sampling num_frames frames uniformly over the video corresponds
-        # to a rate of num_frames / duration frames per second
-        frame_rate = num_frames / duration
-        # if not suppress_msg:
-        #     print(duration, frame_rate, num_frames)
-        output_exists = True
-        for frame_idx in range(num_frames):
-            if not os.path.exists(f"{save_frame_path}{(frame_idx+1):04d}.jpg"):
-                # print(f"{save_frame_path}{(frame_idx+1):04d}.jpg does not exist")
-                output_exists = False
-                save_frame_path = save_frame_path.replace(
-                    f"{num_frames}frames_test_value",
-                    f"{num_frames}frames_test_value_debug")
-                break
-        if output_exists:
-            return
-        split_cmd_template = "ffmpeg {extra} -i {video_path} -vf fps={frame_rate} {output_frame_path}%04d.jpg"
-
-        cur_split_cmd = split_cmd_template.format(
-            extra=extra_args,
-            video_path=video_path,
-            frame_rate=frame_rate,
-            output_frame_path=save_frame_path)
-    if not suppress_msg:
-        print(cur_split_cmd)
-    try:
-        _ = subprocess.run(cur_split_cmd.split(), stdout=subprocess.PIPE)
-    except Exception as e:
-        print(f"Error returned by ffmpeg cmd {e}")
-
-
-COMMON_VIDEO_EXT = set([
-    ".webm", ".mpg", ".mpeg", ".mpv", ".ogg", ".mp4", ".m4p", ".avi",
-    ".wmv", ".qt", ".mov", ".flv", ".swf"
-])
-
-
-def extract_frame(video_file_path,
-                  save_dir,
-                  fps,
-                  num_frames,
-                  debug=False,
-                  corrupt_files=[]):
-    filename = os.path.basename(video_file_path)
-    vid, _ = os.path.splitext(filename)
-    frame_name = f"{vid}_frame"
-    frame_save_path = join(save_dir, frame_name)
-
-    # when a corrupt-file list is given, only the files in that list are
-    # (re-)extracted; everything else is skipped
-    if (video_file_path not in corrupt_files and len(corrupt_files)):
-        # print(f"skipping {video_file_path}")
-        return
-    if len(corrupt_files):
-        print(f"extracting frames for {video_file_path}")
-    launch_extract = True
-    if launch_extract:
-        os.makedirs(save_dir, exist_ok=True)
-        # scale=width:height
-        extract_frame_from_video(video_file_path,
-                                 frame_save_path,
-                                 fps=fps,
-                                 num_frames=num_frames,
-                                 suppress_msg=not debug,
-                                 other_args="")
-
-
-def load_tsv_to_mem(tsv_file, sep='\t'):
-    data = []
-    with open(tsv_file, 'r') as fp:
-        for _, line in enumerate(fp):
-            data.append([x.strip() for x in line.split(sep)])
-    return data
-
-
-def extract_all_frames(video_root_dir,
-                       save_dir,
-                       fps,
-                       num_frames,
-                       video_info_tsv,
-                       corrupt_files,
-                       num_workers,
-                       debug=False):
-
-    raw_video_info = load_tsv_to_mem(video_info_tsv)
-    videoFiles = []
-    for _, line_item in enumerate(raw_video_info):
-        input_file = line_item[0]
-        # input_file = input_file.replace('datasets','_datasets')
-        if os.path.isfile(input_file):
-            videoFiles.append(input_file)
-    if debug:
-        videoFiles = videoFiles[:1]
-
-    if num_workers > 0:
-        from functools import partial
-        extract_frame_partial = partial(extract_frame,
-                                        fps=fps,
-                                        save_dir=save_dir,
-                                        debug=debug,
-                                        corrupt_files=corrupt_files,
-                                        num_frames=num_frames)
-
-        with mp.Pool(num_workers) as pool, tqdm(total=len(videoFiles)) as pbar:
-            for idx, _ in enumerate(
-                    pool.imap_unordered(extract_frame_partial,
-                                        videoFiles,
-                                        chunksize=8)):
-                pbar.update(1)
-    else:
-        for idx, d in tqdm(enumerate(videoFiles),
-                           total=len(videoFiles),
-                           desc="extracting frames from video"):
-            extract_frame(d,
-                          save_dir,
-                          fps=fps,
-                          num_frames=num_frames,
-                          debug=debug,
-                          corrupt_files=corrupt_files)
-            if debug and idx >= 10:
-                break
-
-
-if __name__ == '__main__':
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--video_root_dir", type=str, help="video root dir")
-    parser.add_argument("--save_dir", type=str, help="save frame dir")
-    parser.add_argument("--fps", type=str, default="1")
-    parser.add_argument("--num_frames", type=int, default=-1)
-    parser.add_argument("--num_workers", type=int, default=16)
-    parser.add_argument("--debug", action="store_true")
-    parser.add_argument("--corrupt_file_path",
-                        type=str,
-                        default="",
-                        help="text file listing corrupt videos to re-extract")
-    parser.add_argument("--video_info_tsv",
-                        type=str,
-                        default="",
-                        help="tsv saving all video path")
-    args = parser.parse_args()
-    args.save_dir = args.save_dir + str(args.num_frames) + 'frames'
-
-    corrupt_files = []
-    if os.path.exists(args.corrupt_file_path):
-        with open(args.corrupt_file_path) as f:
-            lines = f.readlines()
-        for ll in lines:
-            corrupt_files.append(ll.strip("\n"))
-
-    extract_all_frames(args.video_root_dir,
-                       args.save_dir,
-                       args.fps,
-                       args.num_frames,
-                       args.video_info_tsv,
-                       corrupt_files,
-                       num_workers=args.num_workers,
-                       debug=args.debug)
diff --git a/AVLFormer/data_prepro/run.sh b/AVLFormer/data_prepro/run.sh
deleted file mode 100644
index a02a17e..0000000
--- a/AVLFormer/data_prepro/run.sh
+++ /dev/null
@@ -1,47 +0,0 @@
-echo 'TRAIN'
-
-python ./data_prepro/extract_frames.py \
---video_root_dir datasets/videos \
---save_dir datasets/ \
---video_info_tsv datasets/metadata/train.img.tsv \
---num_frames 32 \
-
-python ./data_prepro/create_image_frame_tsv.py \
---dataset FAVD \
---split train \
---image_size 256 \
---num_frames 32 \
-
-
-rm -rf datasets/32frames
-echo 'VAL'
-
-python
./data_prepro/extract_frames.py \ ---video_root_dir datasets/videos \ ---save_dir datasets/ \ ---video_info_tsv datasets/metadata/val.img.tsv \ ---num_frames 32 \ - -python ./data_prepro/create_image_frame_tsv.py \ ---dataset FAVD \ ---split val \ ---image_size 256 \ ---num_frames 32 \ - - -rm -rf datasets/32frames -echo 'TEST' - -python ./data_prepro/extract_frames.py \ ---video_root_dir datasets/videos \ ---save_dir datasets/ \ ---video_info_tsv datasets/metadata/test.img.tsv \ ---num_frames 32 \ - -python ./data_prepro/create_image_frame_tsv.py \ ---dataset FAVD \ ---split test \ ---image_size 256 \ ---num_frames 32 \ - -rm -rf datasets/32frames \ No newline at end of file diff --git a/AVLFormer/datasets/.gitkeep b/AVLFormer/datasets/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/AVLFormer/inference.sh b/AVLFormer/inference.sh deleted file mode 100644 index 44ec1f8..0000000 --- a/AVLFormer/inference.sh +++ /dev/null @@ -1,6 +0,0 @@ -python src/tasks/inference.py \ - --eval_model_dir output/favd_default \ - --test_video_fname datasets/videos/test \ - --test_audio_fname datasets/mp3_audio/test \ - --do_lower_case \ - --do_test \ diff --git a/AVLFormer/output/.gitkeep b/AVLFormer/output/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/AVLFormer/run.sh b/AVLFormer/run.sh deleted file mode 100644 index 087f290..0000000 --- a/AVLFormer/run.sh +++ /dev/null @@ -1,12 +0,0 @@ -python ./src/tasks/train.py \ - --config ./src/configs/favd_32frm_default.json \ - --per_gpu_train_batch_size 3 \ - --per_gpu_eval_batch_size 2 \ - --num_train_epochs 150 \ - --learning_rate 0.0003 \ - --max_num_frames 32 \ - --backbone_coef_lr 0.05 \ - --learn_mask_enabled \ - --loss_sparse_w 0.5 \ - --lambda_ 0.1 \ - --output_dir ./output/favd_default \ diff --git a/AVLFormer/src/configs/config.py b/AVLFormer/src/configs/config.py deleted file mode 100644 index d6d6a6f..0000000 --- a/AVLFormer/src/configs/config.py +++ /dev/null @@ -1,704 +0,0 @@ -""" -Modified from ClipBERT code -""" -import argparse -import json -import os -from os import path as op -import sys - -from easydict import EasyDict as edict -from src.utils.logger import LOGGER -from src.utils.miscellaneous import check_yaml_file, str_to_bool -import torch - - -def parse_with_config(parsed_args): - """This function will set args based on the input config file. - (1) it only overwrites unset parameters, - i.e., these parameters not set from user command line input - (2) it also sets configs in the config file but declared in the parser - """ - # convert to EasyDict object, enabling access from attributes even for nested config - # e.g., args.train_datasets[0].name - args = edict(vars(parsed_args)) - if args.config is not None: - config_args = json.load(open(args.config)) - override_keys = { - arg[2:].split("=")[0] - for arg in sys.argv[1:] if arg.startswith("--") - } - for k, v in config_args.items(): - if k not in override_keys: - setattr(args, k, v) - del args.config - return args - - -class SharedConfigs(object): - """Shared options for pre-training and downstream tasks. 
- For each downstream task, implement a get_*_args function, - see `get_pretraining_args()` - - Usage: - >>> shared_configs = SharedConfigs() - >>> pretraining_config = shared_configs.get_pretraining_args() - """ - - def __init__(self, desc="shared config"): - parser = argparse.ArgumentParser(description=desc) - # path configs - parser.add_argument( - "--data_dir", - default='datasets', - type=str, - required=False, - help="Directory with all datasets, each in one subfolder") - parser.add_argument( - "--output_dir", - default='output/', - type=str, - required=False, - help="The output directory to save checkpoint and test results.") - parser.add_argument("--train_yaml", - default='coco_caption/train.yaml', - type=str, - required=False, - help="Yaml file with all data for training.") - - # multimodal transformer modeling config - parser.add_argument("--model_name_or_path", - default=None, - type=str, - required=False, - help="Path to pre-trained model or model type.") - parser.add_argument( - "--config_name", - default="", - type=str, - help="Pretrained config name or path if not the same as model_name." - ) - parser.add_argument( - "--tokenizer_name", - default="", - type=str, - help= - "Pretrained tokenizer name or path if not the same as model_name.") - parser.add_argument("--num_hidden_layers", - default=-1, - type=int, - required=False, - help="Update model config if given") - parser.add_argument("--hidden_size", - default=-1, - type=int, - required=False, - help="Update model config if given") - parser.add_argument( - "--num_attention_heads", - default=-1, - type=int, - required=False, - help="Update model config if given. Note that the division of " - "hidden_size / num_attention_heads should be in integer.") - parser.add_argument("--intermediate_size", - default=-1, - type=int, - required=False, - help="Update model config if given.") - parser.add_argument( - "--img_feature_dim", - default=512, - type=int, - help="Update model config if given.The Image Feature Dimension.") - parser.add_argument( - "--load_partial_weights", - type=str_to_bool, - nargs='?', - const=True, - default=False, - help= - "Only valid when change num_hidden_layers, img_feature_dim, but not other structures. " - "If set to true, will load the first few layers weight from pretrained model." 
- ) - parser.add_argument("--freeze_embedding", - type=str_to_bool, - nargs='?', - const=True, - default=False, - help="Whether to freeze word embeddings in Bert") - parser.add_argument("--drop_out", - default=0.1, - type=float, - help="Drop out ratio in BERT.") - - # inputs to multimodal transformer config - parser.add_argument( - "--max_seq_length", - default=70, - type=int, - help="The maximum total input sequence length after tokenization.") - parser.add_argument("--max_seq_a_length", - default=40, - type=int, - help="The maximum sequence length for caption.") - parser.add_argument( - "--max_img_seq_length", - default=50, - type=int, - help="The maximum total input image sequence length.") - parser.add_argument( - "--do_lower_case", - type=str_to_bool, - nargs='?', - const=True, - default=False, - help="Set this flag if you are using an uncased model.") - parser.add_argument( - "--add_od_labels", - type=str_to_bool, - nargs='?', - const=True, - default=False, - help="Whether to add object detection labels or not") - parser.add_argument("--od_label_conf", - default=0.0, - type=float, - help="Confidence threshold to select od labels.") - parser.add_argument( - "--use_asr", - type=str_to_bool, - nargs='?', - const=True, - default=False, - help="Whether to add ASR/transcript as additional modality input") - parser.add_argument("--unique_labels_on", - type=str_to_bool, - nargs='?', - const=True, - default=False, - help="Use unique labels only.") - parser.add_argument( - "--no_sort_by_conf", - type=str_to_bool, - nargs='?', - const=True, - default=False, - help="By default, we will sort feature/labels by confidence, " - "which is helpful when truncate the feature/labels.") - #======= mask token - parser.add_argument( - "--mask_prob", - default=0.15, - type=float, - help="Probability to mask input sentence during training.") - parser.add_argument( - "--max_masked_tokens", - type=int, - default=3, - help="The max number of masked tokens per sentence.") - parser.add_argument( - "--attn_mask_type", - type=str, - default='seq2seq', - choices=['seq2seq', 'bidirectional', 'learn_vid_mask'], - help="Attention mask type, support seq2seq, bidirectional") - parser.add_argument( - "--text_mask_type", - type=str, - default='random', - choices=['random', 'pos_tag', 'bert_attn', 'attn_on_the_fly'], - help= - "Attention mask type, support random, pos_tag, bert_attn (precomputed_bert_attn), attn_on_the_fly" - ) - parser.add_argument( - "--tag_to_mask", - default=["noun", "verb"], - type=str, - nargs="+", - choices=["noun", "verb", "adjective", "adverb", "number"], - help="what tags to mask") - parser.add_argument( - "--mask_tag_prob", - default=0.8, - type=float, - help= - "Probability to mask input text tokens with included tags during training." 
- ) - parser.add_argument( - "--tagger_model_path", - type=str, - default='models/flair/en-pos-ontonotes-fast-v0.5.pt', - help="checkpoint path to tagger model") - parser.add_argument( - "--random_mask_prob", - default=0, - type=float, - help= - "Probability to mask input text tokens randomly when using other text_mask_type" - ) - - # data loading - parser.add_argument( - "--on_memory", - type=str_to_bool, - nargs='?', - const=True, - default=False, - help="Option to load labels/caption to memory before training.") - parser.add_argument("--effective_batch_size", - default=-1, - type=int, - help="Batch size over all GPUs for training.") - parser.add_argument("--per_gpu_train_batch_size", - default=64, - type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--num_workers", - default=4, - type=int, - help="Workers in dataloader.") - parser.add_argument( - '--limited_samples', - type=int, - default=-1, - help= - "Set # of samples per node. Data partition for cross-node training." - ) - - # training configs - parser.add_argument("--learning_rate", - default=3e-5, - type=float, - help="The initial lr.") - parser.add_argument("--weight_decay", - default=0.05, - type=float, - help="Weight deay.") - parser.add_argument("--adam_epsilon", - default=1e-8, - type=float, - help="Epsilon for Adam.") - parser.add_argument("--max_grad_norm", - default=1.0, - type=float, - help="Max gradient norm.") - parser.add_argument("--warmup_ratio", - default=0.1, - type=float, - help="Linear warmup.") - parser.add_argument("--scheduler", - default='warmup_linear', - type=str, - help="warmup_linear (triangle) or step", - choices=["warmup_linear", "step"]) - parser.add_argument('--gradient_accumulation_steps', - type=int, - default=1) - parser.add_argument("--num_train_epochs", - default=20, - type=int, - help="Total number of training epochs to perform.") - parser.add_argument('--logging_steps', - type=int, - default=20, - help="Log every X steps.") - parser.add_argument( - '--save_steps', - type=int, - default=2000, - help="Save checkpoint every X steps. 
Will also perform evaluatin.") - parser.add_argument('--restore_ratio', - type=float, - default=0.05, - help="save restorer checkpoint for 0.05 ratio") - parser.add_argument("--device", - type=str, - default='cuda', - help="cuda or cpu") - parser.add_argument('--seed', - type=int, - default=88, - help="random seed for initialization.") - parser.add_argument("--local_rank", - type=int, - default=0, - help="For distributed training.") - # ========= mix-precision training (>torch1.6 only) - parser.add_argument( - '--mixed_precision_method', - default='apex', - type=str, - help= - "set mixed_precision_method, options: apex, deepspeed, fairscale", - choices=["apex", "deepspeed", "fairscale"]) - parser.add_argument('--zero_opt_stage', - type=int, - help="zero_opt_stage, only allowed in deepspeed", - default=-1, - choices=[0, 1, 2, 3]) - parser.add_argument( - '--amp_opt_level', - default=0, - help="amp optimization level, can set for both deepspeed and apex", - type=int, - choices=[0, 1, 2, 3]) - parser.add_argument('--deepspeed_fp16', - help="use fp16 for deepspeed", - type=str_to_bool, - nargs='?', - const=True, - default=False) - parser.add_argument('--fairscale_fp16', - help="use fp16 for fairscale", - type=str_to_bool, - nargs='?', - const=True, - default=False) - # ========= resume training or load pre_trained weights - parser.add_argument('--pretrained_checkpoint', type=str, default='') - - # for debug purpose - parser.add_argument('--debug', - type=str_to_bool, - nargs='?', - const=True, - default=False) - parser.add_argument('--debug_speed', - type=str_to_bool, - nargs='?', - const=True, - default=False) - - # can use config files, will only overwrite unset parameters - parser.add_argument("--config", help="JSON config files") - self.parser = parser - - def parse_args(self): - parsed_args = self.parser.parse_args() - args = parse_with_config(parsed_args) - return args - - def add_downstream_args(self): - # downstream finetuning args (not needed for pretraining) - self.parser.add_argument("--eval_model_dir", - type=str, - default='', - help="Model directory for evaluation.") - - # training/validation/inference mode (only needed for captioning) - self.parser.add_argument("--val_yaml", - default='coco_caption/val.yaml', - type=str, - required=False, - help="Yaml file with all data for validation") - self.parser.add_argument( - "--test_yaml", - default='coco_caption/test.yaml', - type=str, - required=False, - nargs='+', - help="Yaml file with all data for testing, could be multiple files." - ) - - self.parser.add_argument("--do_train", - type=str_to_bool, - nargs='?', - const=True, - default=False, - help="Whether to run training.") - self.parser.add_argument("--do_test", - type=str_to_bool, - nargs='?', - const=True, - default=False, - help="Whether to run inference.") - self.parser.add_argument("--do_eval", - type=str_to_bool, - nargs='?', - const=True, - default=False, - help="Whether to run evaluation.") - self.parser.add_argument( - "--evaluate_during_training", - type=str_to_bool, - nargs='?', - const=True, - default=False, - help="Run evaluation during training at each save_steps.") - self.parser.add_argument("--per_gpu_eval_batch_size", - default=64, - type=int, - help="Batch size per GPU/CPU for evaluation.") - return - - def shared_video_captioning_config(self, cbs=False, scst=False): - self.add_downstream_args() - # image feature masking (only used in captioning?) 
-        # image feature masking (only used in captioning?)
-        self.parser.add_argument('--mask_img_feat',
-                                 type=str_to_bool,
-                                 nargs='?',
-                                 const=True,
-                                 default=False,
-                                 help='Enable image feature masking')
-        self.parser.add_argument('--max_masked_img_tokens',
-                                 type=int,
-                                 default=10,
-                                 help="Maximum masked object features")
-
-        # basic decoding configs
-        self.parser.add_argument(
-            "--tie_weights",
-            type=str_to_bool,
-            nargs='?',
-            const=True,
-            default=False,
-            help="Whether to tie decoding weights to that of encoding")
-        self.parser.add_argument("--label_smoothing",
-                                 default=0,
-                                 type=float,
-                                 help=".")
-        self.parser.add_argument("--drop_worst_ratio",
-                                 default=0,
-                                 type=float,
-                                 help=".")
-        self.parser.add_argument("--drop_worst_after",
-                                 default=0,
-                                 type=int,
-                                 help=".")
-        self.parser.add_argument('--max_gen_length',
-                                 type=int,
-                                 default=20,
-                                 help="max length of generated sentences")
-        self.parser.add_argument('--output_hidden_states',
-                                 type=str_to_bool,
-                                 nargs='?',
-                                 const=True,
-                                 default=False,
-                                 help="Turn on for fast decoding")
-        self.parser.add_argument('--num_return_sequences',
-                                 type=int,
-                                 default=1,
-                                 help="repeating times per image")
-        self.parser.add_argument('--num_beams',
-                                 type=int,
-                                 default=1,
-                                 help="beam search width")
-        self.parser.add_argument(
-            '--num_keep_best',
-            type=int,
-            default=1,
-            help="number of hypotheses to keep in beam search")
-        self.parser.add_argument('--temperature',
-                                 type=float,
-                                 default=1,
-                                 help="temperature in softmax for sampling")
-        self.parser.add_argument('--top_k',
-                                 type=int,
-                                 default=0,
-                                 help="filter distribution for sampling")
-        self.parser.add_argument('--top_p',
-                                 type=float,
-                                 default=1,
-                                 help="filter distribution for sampling")
-        self.parser.add_argument('--repetition_penalty',
-                                 type=int,
-                                 default=1,
-                                 help="repetition penalty from CTRL paper "
-                                 "(https://arxiv.org/abs/1909.05858)")
-        self.parser.add_argument('--length_penalty',
-                                 type=int,
-                                 default=1,
-                                 help="beam search length penalty")
-
-        if cbs:
-            self.constraint_beam_search_args()
-        if scst:
-            self.self_critic_args()
-
-        return
-
-    def constraint_beam_search_args(self):
-
-        # for Constrained Beam Search
-        self.parser.add_argument(
-            '--use_cbs',
-            type=str_to_bool,
-            nargs='?',
-            const=True,
-            default=False,
-            help='Use constrained beam search for decoding')
-        self.parser.add_argument(
-            '--min_constraints_to_satisfy',
-            type=int,
-            default=2,
-            help="minimum number of constraints to satisfy")
-        self.parser.add_argument(
-            '--use_hypo',
-            type=str_to_bool,
-            nargs='?',
-            const=True,
-            default=False,
-            help='Store hypotheses for constrained beam search')
-        self.parser.add_argument(
-            '--decoding_constraint',
-            type=str_to_bool,
-            nargs='?',
-            const=True,
-            default=False,
-            help='When decoding enforce the constraint that the '
-            'word cannot be consecutively predicted twice in a row')
-        self.parser.add_argument(
-            '--remove_bad_endings',
-            type=str_to_bool,
-            nargs='?',
-            const=True,
-            default=False,
-            help='When decoding enforce that the tokens in bad endings, '
-            'e.g., a, the, etc cannot be predicted at the end of the sentence')
-        return
-
-    def self_critic_args(self):
-        # for self-critical sequence training
-        self.parser.add_argument('--scst',
-                                 type=str_to_bool,
-                                 nargs='?',
-                                 const=True,
-                                 default=False,
-                                 help='Self-critical sequence training')
-        self.parser.add_argument(
-            '--sc_train_sample_n',
-            type=int,
-            default=5,
-            help="number of sampled captions for sc training")
-        self.parser.add_argument('--sc_baseline_type',
-                                 type=str,
-                                 default='greedy',
-                                 help="baseline type of REINFORCE algorithm")
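# [Editor's sketch, not part of the original file.] The SCST flags above feed
# self-critical sequence training (Rennie et al., 2017): sample several
# captions, score each one (typically with CIDEr, cf. --cider_cached_tokens
# below), and use the greedy-decoded caption's score as the REINFORCE
# baseline, which is why sc_baseline_type defaults to 'greedy'. A minimal
# PyTorch illustration of the resulting loss; the tensor values are made up:

import torch

def scst_loss(sample_log_probs, sample_rewards, greedy_rewards):
    # positive advantage -> increase the sampled caption's log-probability
    advantage = sample_rewards - greedy_rewards
    return -(advantage.detach() * sample_log_probs).mean()

# e.g. sc_train_sample_n = 5 sampled captions for one video clip:
loss = scst_loss(torch.randn(5, requires_grad=True),       # summed token log-probs
                 torch.tensor([0.9, 1.2, 0.4, 1.0, 0.7]),  # CIDEr of samples
                 torch.full((5,), 0.8))                    # CIDEr of greedy caption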
self.parser.add_argument( - '--cider_cached_tokens', - type=str, - default='coco_caption/gt/coco-train-words.p', - help="path to cached cPickle file used to calculate CIDEr scores") - return - - -shared_configs = SharedConfigs() - - -def basic_check_arguments(args): - args.output_dir = args.output_dir.replace(" ", "_") - if args.debug_speed: - args.logging_steps = 1 - args.num_train_epochs = 1 - - if args.debug: - args.effective_batch_size = args.num_gpus - args.per_gpu_train_batch_size = 1 - args.num_train_epochs = 1 - args.logging_steps = 5 - args.max_img_seq_length = 98 - - # can add some basic checks here - if args.mixed_precision_method != "deepspeed": - LOGGER.info( - "Deepspeed is not enabled. We will disable the relevant args --zero_opt_stage and --deepspeed_fp16." - ) - args.zero_opt_stage = -1 - args.deepspeed_fp16 = False - - if args.mixed_precision_method != "fairscale": - LOGGER.info( - "Fairscale is not enabled. We will disable the relevant args --fairscale_fp16." - ) - args.zero_opt_stage = -1 - args.fairscale_fp16 = False - - if args.mixed_precision_method != "apex": - LOGGER.info("Disable restorer for deepspeed or fairscale") - args.restore_ratio = -1 - - if args.text_mask_type != "pos_tag": - LOGGER.info("Disable --mask_tag_prob") - args.mask_tag_prob = -1 - - if hasattr(args, 'do_train') and args.do_train: - check_yaml_file(op.join(args.data_dir, args.train_yaml)) - if args.evaluate_during_training: - check_yaml_file(op.join(args.data_dir, args.val_yaml)) - # check after num_gpus initialized - if args.effective_batch_size > 0: - assert args.effective_batch_size % args.num_gpus == 0 - args.per_gpu_train_batch_size = int(args.effective_batch_size / - args.num_gpus) - args.per_gpu_eval_batch_size = int(args.effective_batch_size / - args.num_gpus) - else: - assert args.per_gpu_train_batch_size > 0 - args.effective_batch_size = args.per_gpu_train_batch_size * args.num_gpus - args.per_gpu_eval_batch_size = max(args.per_gpu_eval_batch_size, - args.per_gpu_train_batch_size) - - if args.use_asr: - args.add_od_labels = True - if args.add_od_labels: - assert args.max_seq_length > args.max_seq_a_length - else: - assert args.max_seq_length == args.max_seq_a_length - if hasattr(args, 'do_test') and args.do_test: - for test_yaml in args.test_yaml: - check_yaml_file(op.join(args.data_dir, test_yaml)) - - -def restore_training_settings(args): - ''' Restore args for inference and SCST training - Only works for downstream finetuning - ''' - if args.do_train: - if hasattr(args, 'scst') and not args.scst: - return args - checkpoint = args.model_name_or_path - else: - assert args.do_test or args.do_eval - checkpoint = args.eval_model_dir - # restore training settings, check hasattr for backward compatibility - try: - # train_args = torch.load(op.join(checkpoint, os.pardir, 'log', 'args.json')) # - json_path = op.join(checkpoint, os.pardir, 'log', 'args.json') - f = open(json_path, 'r') - json_data = json.load(f) - from easydict import EasyDict - train_args = EasyDict(json_data) - except Exception as e: - train_args = torch.load(op.join(checkpoint, 'training_args.bin')) - - if args.add_od_labels: - if hasattr(train_args, 'max_seq_a_length'): - if hasattr(train_args, 'scst') and train_args.scst: - max_od_labels_len = train_args.max_seq_length - train_args.max_gen_length - else: - max_od_labels_len = train_args.max_seq_length - train_args.max_seq_a_length - max_seq_length = args.max_gen_length + max_od_labels_len - args.max_seq_length = max_seq_length - LOGGER.warning( - 'Override max_seq_length 
to {} = max_gen_length:{} + od_labels_len:{}' - .format(max_seq_length, args.max_gen_length, - max_od_labels_len)) - - override_params = [ - 'do_lower_case', 'add_od_labels', 'img_feature_dim', 'no_sort_by_conf', - 'num_hidden_layers' - ] - for param in override_params: - if hasattr(train_args, param): - train_v = getattr(train_args, param) - test_v = getattr(args, param) - if train_v != test_v: - LOGGER.warning('Override {} with train args: {} -> {}'.format( - param, test_v, train_v)) - setattr(args, param, train_v) - - if hasattr(args, 'scst') and args.scst == True: - args.max_seq_length = train_args.max_gen_length - args.max_seq_a_length = train_args.max_gen_length - return args diff --git a/AVLFormer/src/configs/favd_32frm_default.json b/AVLFormer/src/configs/favd_32frm_default.json deleted file mode 100644 index 99e88f2..0000000 --- a/AVLFormer/src/configs/favd_32frm_default.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - "do_train": true, - "evaluate_during_training": true, - "data_dir": "datasets", - "train_yaml": "train_32frames.yaml", - "val_yaml": "val_32frames.yaml", - "do_lower_case": true, - "max_seq_a_length": 300, - "max_seq_length": 300, - "max_img_seq_length": 1257, - "img_res": 224, - "max_num_frames": 32, - "patch_size": 32, - "per_gpu_eval_batch_size": 1, - "per_gpu_train_batch_size": 1, - "num_workers": 16, - "model_name_or_path": "captioning/bert-base-uncased/", - "pretrained_checkpoint": "", - "img_feature_dim": 512, - "vidswin_size": "base", - "kinetics": "600", - "use_clip_model": true, - "pretrained_2d": 0, - "grid_feat": true, - "mask_prob": 0.25, - "max_masked_tokens": 75, - "attn_mask_type": "seq2seq", - "max_gen_length": 115, - "on_memory": true, - "use_checkpoint": true, - "num_train_epochs": 150, - "learning_rate": 0.0003, - "backbone_coef_lr": 0.05, - "scheduler": "warmup_linear", - "warmup_ratio": 0.1, - "weight_decay": 0.05, - "max_grad_norm": 1.0, - "gradient_accumulation_steps": 1, - "mixed_precision_method": "apex", - "amp_opt_level": 0, - "deepspeed_fp16": false, - "fairscale_fp16": false, - "zero_opt_stage": 1, - "restore_ratio": -1, - "debug": false, - "debug_speed": false, - "seed": 88 -} \ No newline at end of file diff --git a/AVLFormer/src/datasets/caption_tensorizer.py b/AVLFormer/src/datasets/caption_tensorizer.py deleted file mode 100755 index 1b216f7..0000000 --- a/AVLFormer/src/datasets/caption_tensorizer.py +++ /dev/null @@ -1,457 +0,0 @@ -import html -import os.path as op -import random -import re - -import torch - -FLAIR_TAG = { - "noun": ["NN", "NNP", "NNPS", "NNS", "PRP", "PRP$", "WP", "WP$"], - "verb": ["VB", "VBD", "VBG", "VBP", "VBZ"], - "adjective": ["JJ", "JJR", "JJS"], - "adverb": ["RB", "RBR", "RBS", "WRB"], - "number": ["CD"] -} - - -class CaptionTensorizer(object): - - def __init__(self, - tokenizer, - max_img_seq_length=50, - max_seq_length=70, - max_seq_a_length=40, - mask_prob=0.15, - max_masked_tokens=3, - attn_mask_type='seq2seq', - is_train=True, - mask_b=False, - text_mask_type='random', - tag_to_mask=None, - mask_tag_prob=0.8, - random_mask_prob=0.5, - tagger=None): - """Constructor. - Args: - tokenizer: tokenizer for text processing. - max_img_seq_length: max image sequence length. - max_seq_length: max text sequence length. - max_seq_a_length: max caption sequence length. - is_train: train or test mode. - mask_prob: probability to mask a input token. - max_masked_tokens: maximum number of tokens to be masked in one sentence. - attn_mask_type: attention mask type, support seq2seq/bidirectional/cap_s2s/cap_bidir. 
- mask_b: whether to mask text_b or not during training. - """ - self.tokenizer = tokenizer - self.is_train = is_train - self.max_img_seq_len = max_img_seq_length - self.max_seq_len = max_seq_length - self.max_seq_a_len = max_seq_a_length - self.mask_prob = mask_prob - self.max_masked_tokens = max_masked_tokens - self.attn_mask_type = attn_mask_type - self.text_mask_type = text_mask_type - self.mask_b = mask_b - self.tag_to_mask = None - self.mask_tag_prob = 0 - self.random_mask_prob = 1 - self.tagger = tagger - if is_train: - assert attn_mask_type in ('seq2seq', 'bidirectional', 'cap_s2s', - 'cap_bidir', 'learn_vid_att') - assert text_mask_type in ('random', 'bert_attn', 'pos_tag', - 'attn_on_the_fly') - if self.text_mask_type == 'pos_tag': - self.tag_to_mask = tag_to_mask - self.included_tags = set() - for tag_type in self.tag_to_mask: # avoid shadowing the builtin `type` - self.included_tags.update(set(FLAIR_TAG[tag_type])) - self.mask_tag_prob = mask_tag_prob - if self.text_mask_type != "random": - self.random_mask_prob = random_mask_prob - else: - assert attn_mask_type in ('seq2seq', 'learn_vid_att') - - self._triangle_mask = torch.tril( - torch.ones((self.max_seq_len, self.max_seq_len), dtype=torch.long)) - - def get_pos_tag_mask_idx(self, seq_a_len, text_meta): - ''' The rest - ADD Email - AFX Affix - CC Coordinating conjunction - DT Determiner - EX Existential there - FW Foreign word - HYPH Hyphen - IN Preposition or subordinating conjunction - LS List item marker - MD Modal - NFP Superfluous punctuation - PDT Predeterminer - POS Possessive ending - RP Particle - SYM Symbol - TO to - UH Interjection - WDT Wh-determiner - XX - ''' - # process loaded pos_tags - pos_tags = text_meta["bert_pos_tag"] - if len(pos_tags) > seq_a_len - 2: - pos_tags = pos_tags[:seq_a_len - 2] - pos_tags = [None] + pos_tags + [None] - padding_len = seq_a_len - len(pos_tags) - pos_tags += [None] * padding_len - allow_masked_ids = set() - for bert_idx, tag in enumerate(pos_tags): - if tag is None: - continue - if bert_idx >= seq_a_len: - break - if tag not in self.included_tags: - continue - allow_masked_ids.add(bert_idx) - return pos_tags, allow_masked_ids - - def get_bert_attn_mask_idx(self, seq_a_len, text_meta, num_masked): - # process loaded bert attention weights (assuming max_len = 50) - attn_weights = text_meta["bert_attn"] - if len(attn_weights) > seq_a_len: - attn_weights = attn_weights[:seq_a_len] - elif len(attn_weights) < seq_a_len: - # pad with zeros (extend the loaded weights rather than overwrite them) - padding_len = seq_a_len - len(attn_weights) - attn_weights = attn_weights + [0.0] * padding_len - mask_idx = torch.multinomial(torch.tensor(attn_weights), - num_masked).tolist() - return mask_idx - - def get_attn_masks(self, seq_a_len, seq_len, mode='default'): - # image features - img_len = self.max_img_seq_len - - max_len = self.max_seq_len + self.max_img_seq_len - # C: caption, L: label, R: image region - c_start, c_end = 0, seq_a_len - l_start, l_end = self.max_seq_a_len, seq_len - r_start, r_end = self.max_seq_len, self.max_seq_len + img_len - - if self.attn_mask_type in ('learn_vid_att', ): - - if mode == 'default': - # prepare attention mask: - # the caption uses a triangle (causal) mask on itself and has - # full attention on the video tokens; in this mode the video - # tokens do not attend back to the caption.
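(Aside: a minimal, self-contained sketch of the 'default' mask layout the code below builds, using toy lengths. All sizes here are illustrative, not the real configs; only the `torch.tril` and slicing pattern mirror the implementation.)

```python
import torch

# Toy layout of the 'default' mask: sizes are made up for illustration.
max_seq_len, img_len, seq_a_len = 6, 4, 3   # text budget, video tokens, caption length
max_len = max_seq_len + img_len
mask = torch.zeros((max_len, max_len), dtype=torch.long)
# causal (triangle) attention within the caption
mask[:seq_a_len, :seq_a_len] = torch.tril(
    torch.ones((seq_a_len, seq_a_len), dtype=torch.long))
# caption attends to every video token
mask[:seq_a_len, max_seq_len:] = 1
# video tokens attend to each other, but not back to the caption
mask[max_seq_len:, max_seq_len:] = 1
print(mask)
```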
- attention_mask = torch.zeros((max_len, max_len), - dtype=torch.long) - # triangle mask for caption to caption - attention_mask[c_start:c_end, c_start:c_end].copy_( - self._triangle_mask[0:seq_a_len, 0:seq_a_len]) - # full attention for C-L, C-R - attention_mask[c_start:c_end, l_start:l_end] = 1 - attention_mask[c_start:c_end, r_start:r_end] = 1 - # full attention for video tokens: - attention_mask[l_start:r_end, l_start:r_end] = 1 - elif mode == 'full': - attention_mask = torch.zeros((max_len, max_len), - dtype=torch.long) - # triangle mask for caption to caption - attention_mask[c_start:c_end, c_start:c_end].copy_( - self._triangle_mask[0:seq_a_len, 0:seq_a_len]) - # full attention for C-L, C-R - attention_mask[c_start:c_end, l_start:l_end] = 1 - attention_mask[c_start:c_end, r_start:r_end] = 1 - # full attention for video tokens: - attention_mask[l_start:r_end, l_start:r_end] = 1 - # additional attention for video tokens: - attention_mask[l_start:l_end, c_start:c_end] = 1 - attention_mask[r_start:r_end, c_start:c_end] = 1 - elif mode == 'sdir': - attention_mask = torch.zeros((max_len, max_len), - dtype=torch.long) - # triangle mask for caption to caption - attention_mask[c_start:c_end, c_start:c_end].copy_( - self._triangle_mask[0:seq_a_len, 0:seq_a_len]) - # full attention for C-L, C-R - attention_mask[c_start:c_end, l_start:l_end] = 1 - attention_mask[c_start:c_end, r_start:r_end] = 1 - # triangle (causal) attention for video tokens: - tri_mask = torch.tril( - torch.ones((img_len, img_len), dtype=torch.long)) - attention_mask[l_start:r_end, l_start:r_end].copy_(tri_mask) - - return attention_mask - - def get_text_mask_idx(self, seq_a_len, seq_len, text_meta=None): - # randomly mask words for prediction, ignore [CLS], [PAD] - # it is important to mask [SEP] for image captioning as it means [EOS]. - - # 1. get the number of masked tokens - if self.mask_b: - # can mask both text_a and text_b - num_masked = min(max(round(self.mask_prob * seq_len), 1), - self.max_masked_tokens) - else: - # only mask text_a - num_masked = min(max(round(self.mask_prob * seq_a_len), 1), - self.max_masked_tokens) - num_masked = int(num_masked) - - # 2.
get the masking candidates - if self.mask_b: - # text b always random masking - text_b_candidate = list(range(self.max_seq_a_len, seq_len)) - else: - text_b_candidate = [] - if self.text_mask_type == 'pos_tag' and random.random( - ) > self.random_mask_prob: - full_candidate = set(list(range(1, seq_a_len))) - pos_tags, pos_tag_candidate = self.get_pos_tag_mask_idx( - text_meta=text_meta, seq_a_len=seq_a_len) - - left_over_candidate = list( - full_candidate.difference( - pos_tag_candidate)) + text_b_candidate - pos_tag_candidate = list(pos_tag_candidate) - num_pos_tag_masked = min( - max(1, int(num_masked * self.mask_tag_prob)), - len(pos_tag_candidate)) - random.shuffle(pos_tag_candidate) - masked_idx = pos_tag_candidate[:num_pos_tag_masked] - - num_left_overs = num_masked - num_pos_tag_masked - # add mask to achieve mask prob - if num_left_overs > 0: - random.shuffle(left_over_candidate) - other_masked_idx = left_over_candidate[:num_left_overs] - masked_idx += other_masked_idx - elif self.text_mask_type == 'bert_attn' and random.random( - ) > self.random_mask_prob: - masked_idx = self.get_bert_attn_mask_idx(seq_a_len, text_meta, - num_masked) - else: - # random - candidate_masked_idx = list(range(1, seq_a_len)) - candidate_masked_idx += text_b_candidate - random.shuffle(candidate_masked_idx) - masked_idx = candidate_masked_idx[:num_masked] - masked_idx = sorted(masked_idx) - return masked_idx - - def mask_text_inputs(self, tokens, seq_a_len, seq_len, text_meta=None): - if self.is_train: - # text_mask_type = 'random' - if self.text_mask_type == "attn_on_the_fly" and random.random( - ) > self.random_mask_prob and len(tokens) > 2: - # self.text_mask_type == "attn_on_the_fly" - masked_pos = torch.zeros(self.max_seq_len, dtype=torch.int) - masked_pos[1:seq_a_len] += 1 - masked_pos[0] = self.tokenizer.convert_tokens_to_ids( - [self.tokenizer.mask_token])[0] - mlm_targets = [-1] * self.max_masked_tokens - else: - masked_idx = self.get_text_mask_idx(seq_a_len, seq_len, - text_meta) - try: - masked_token = [tokens[i] for i in masked_idx] - except Exception as e: - overflow_idx = [] - for i in masked_idx: - if i >= len(tokens) or i < 0: - overflow_idx.append(i) - raise ValueError( - f"Error {e}\nOverflow: {overflow_idx} in tokens {tokens}" - ) - for pos in masked_idx: - if random.random() <= 0.8: - # 80% chance to be a ['MASK'] token - tokens[pos] = self.tokenizer.mask_token - elif random.random() <= 0.5: - # 10% chance to be a random word ((1-0.8)*0.5) - tokens[pos] = self.tokenizer.get_random_token() - else: - # 10% chance to remain the same (1-0.8-0.1) - pass - - masked_pos = torch.zeros(self.max_seq_len, dtype=torch.int) - masked_pos[masked_idx] = 1 - - # get the actual number of masked tokens - num_masked = len(masked_token) - mlm_targets = self.tokenizer.convert_tokens_to_ids( - masked_token) - if num_masked < self.max_masked_tokens: - mlm_targets = mlm_targets + ( - [-1] * (self.max_masked_tokens - num_masked)) - assert len( - mlm_targets - ) == self.max_masked_tokens, f"mismatch in len(masked_ids) {len(mlm_targets)} vs. 
max_masked_tokens {self.max_masked_tokens}" - elif not self.is_train: - masked_pos = torch.ones(self.max_seq_len, dtype=torch.int) - mlm_targets = None - - return tokens, masked_pos, mlm_targets - - def prepro_raw_txt(self, text): - # in case there are html special characters - text = html.unescape(text) - # FIXME: quick hack for text with emoji, may adopt twitter tokenizer later - emoji_pattern = re.compile( - "[" - u"\U0001F600-\U0001F64F" # emoticons - u"\U0001F300-\U0001F5FF" # symbols & pictographs - u"\U0001F680-\U0001F6FF" # transport & map symbols - u"\U0001F1E0-\U0001F1FF" # flags (iOS) - "]+", - flags=re.UNICODE) - text = emoji_pattern.sub(r'', text) - return text - - def tokenize_text_inputs(self, - text_a, - text_b=None, - cls_token_segment_id=0, - pad_token_segment_id=0, - sequence_a_segment_id=0, - sequence_b_segment_id=1, - text_meta=None): - text_a = self.prepro_raw_txt(text_a) - if self.is_train: - tokens_a = self.tokenizer.tokenize(text_a) - if self.text_mask_type == "pos_tag": - assert text_meta is not None and 'bert_pos_tag' in text_meta - assert len(text_meta['bert_pos_tag']) == len(tokens_a) - elif self.text_mask_type == "bert_attn": - assert text_meta is not None and 'bert_attn' in text_meta - assert (len(text_meta['bert_attn']) == len(tokens_a) + 2 - or len(text_meta['bert_attn']) == self.max_seq_a_len) - else: - # fake tokens to generate masks - tokens_a = [self.tokenizer.mask_token] * (self.max_seq_a_len - 2) - if len(tokens_a) > self.max_seq_a_len - 2: - tokens_a = tokens_a[:(self.max_seq_a_len - 2)] - - tokens = [self.tokenizer.cls_token - ] + tokens_a + [self.tokenizer.sep_token] - segment_ids = [cls_token_segment_id - ] + [sequence_a_segment_id] * (len(tokens) - 1) - seq_a_len = len(tokens) - if text_b: - text_b = self.prepro_raw_txt(text_b) - # pad text_a to keep it in fixed length for better inference. 
- # we do not use pos tag for text_b - padding_a_len = self.max_seq_a_len - seq_a_len - tokens += [self.tokenizer.pad_token] * padding_a_len - segment_ids += ([pad_token_segment_id] * padding_a_len) - - tokens_b = self.tokenizer.tokenize(text_b) - if len(tokens_b) > self.max_seq_len - len(tokens) - 1: - tokens_b = tokens_b[:(self.max_seq_len - len(tokens) - 1)] - tokens += tokens_b + [self.tokenizer.sep_token] - segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1) - - seq_len = len(tokens) - return tokens, segment_ids, seq_a_len, seq_len - - def tensorize_example_e2e(self, - text_a, - img_feat, - audio_feat, - text_b=None, - cls_token_segment_id=0, - pad_token_segment_id=0, - sequence_a_segment_id=0, - sequence_b_segment_id=1, - text_meta=None, - mode='default'): - # tokenize the texts - tokens, segment_ids, seq_a_len, seq_len = self.tokenize_text_inputs( - text_a, text_b, cls_token_segment_id, pad_token_segment_id, - sequence_a_segment_id, sequence_b_segment_id, text_meta) - - # masking the tokens - tokens_after_masking, masked_pos, mlm_targets = self.mask_text_inputs( - tokens, seq_a_len, seq_len, text_meta) - - # pad on the right for image captioning - seq_padding_len = self.max_seq_len - seq_len - - raw_tokens = tokens - tokens = tokens_after_masking + ([self.tokenizer.pad_token] * - seq_padding_len) - input_tokens = tokens_after_masking[:-1] + ( - [self.tokenizer.pad_token] * (seq_padding_len + 1)) - output_tokens = raw_tokens[1:] + ([self.tokenizer.pad_token] * - (seq_padding_len + 1)) - input_token_ids = torch.tensor( - self.tokenizer.convert_tokens_to_ids(input_tokens), - dtype=torch.long) - output_token_ids = torch.tensor( - self.tokenizer.convert_tokens_to_ids(output_tokens), - dtype=torch.long) - - segment_ids += ([pad_token_segment_id] * seq_padding_len) - input_ids = self.tokenizer.convert_tokens_to_ids(tokens) - - attention_mask = self.get_attn_masks(seq_a_len, seq_len, mode) - - input_ids = torch.tensor(input_ids, dtype=torch.long) - segment_ids = torch.tensor(segment_ids, dtype=torch.long) - - if self.is_train: - mlm_targets = torch.tensor(mlm_targets, dtype=torch.long) - return (input_ids, attention_mask, segment_ids, img_feat, - audio_feat, masked_pos, mlm_targets, input_token_ids, - output_token_ids) - return input_ids, attention_mask, segment_ids, img_feat, audio_feat, masked_pos, input_token_ids, output_token_ids - - -def build_tensorizer(args, tokenizer, is_train=True): - if hasattr(args, 'mask_od_labels'): - mask_b = args.mask_od_labels - else: - mask_b = False - if is_train: - if args.text_mask_type == "pos_tag": - # tagger = SequenceTagger.load( - # 'models/pos-english-fast/pos' - # ) - # if op.exists(args.tagger_model_path): - # tagger = SequenceTagger.load(args.tagger_model_path) - # else: - # LOGGER.info(f'{args.tagger_model_path} does not exists, download on the fly...') - tag_to_mask = set(args.tag_to_mask) - # elif args.text_mask_type == "bert_attn": - # bert = - else: - tagger = None - tag_to_mask = None - return CaptionTensorizer( - tokenizer, - max_img_seq_length=args.max_img_seq_length, - max_seq_length=args.max_seq_length, - max_seq_a_length=args.max_seq_a_length, - mask_prob=args.mask_prob, - max_masked_tokens=args.max_masked_tokens, - attn_mask_type=args.attn_mask_type, - is_train=True, - mask_b=mask_b, - text_mask_type=args.text_mask_type, - mask_tag_prob=args.mask_tag_prob, - tag_to_mask=tag_to_mask, - random_mask_prob=args.random_mask_prob, - tagger=None, - ) - return CaptionTensorizer( - tokenizer, - 
max_img_seq_length=args.max_img_seq_length, - max_seq_length=args.max_seq_length - if args.add_od_labels else args.max_gen_length, - max_seq_a_length=args.max_gen_length, - is_train=False, - attn_mask_type=args.attn_mask_type, - ) diff --git a/AVLFormer/src/datasets/data_sampler.py b/AVLFormer/src/datasets/data_sampler.py deleted file mode 100644 index 4260248..0000000 --- a/AVLFormer/src/datasets/data_sampler.py +++ /dev/null @@ -1,213 +0,0 @@ -from datetime import datetime -import logging -import math -import random - -from src.utils.comm import get_local_rank, get_local_size, get_rank, get_world_size -import torch -import torch.distributed as dist -from torch.utils.data import Dataset, Sampler - -from .sampler_utils import PrepareData - -logger = logging.getLogger(__name__) - - -class DistributedSamplerLimited(Sampler): - - def __init__(self, - dataset: Dataset, - num_replicas: int = None, - rank: int = None, - shuffle: bool = True, - seed: int = 0, - drop_last: bool = False, - limited=-1) -> None: - if num_replicas is None: - if not dist.is_available(): - raise RuntimeError( - "Requires distributed package to be available") - num_replicas = dist.get_world_size() - if rank is None: - if not dist.is_available(): - raise RuntimeError( - "Requires distributed package to be available") - rank = dist.get_rank() - if rank >= num_replicas or rank < 0: - raise ValueError("Invalid rank {}, rank should be in the interval" - " [0, {}]".format(rank, num_replicas - 1)) - self.dataset = dataset - self.num_replicas = num_replicas - self.rank = rank - self.epoch = 0 - self.drop_last = drop_last - print( - f'Dbg: distributed sampler limited: rank={rank}, num_replicas={num_replicas}' - ) - # If the dataset length is evenly divisible by # of replicas, then there - # is no need to drop any data, since the dataset will be split equally. - if self.drop_last and len( - self.dataset) % self.num_replicas != 0: # type: ignore - # Split to nearest available length that is evenly divisible. - # This is to ensure each rank receives the same amount of data when - # using this Sampler. - self.num_samples = math.ceil( - # `type:ignore` is required because Dataset cannot provide a default __len__ - # see NOTE in pytorch/torch/utils/data/sampler.py - (len(self.dataset) - self.num_replicas) / - self.num_replicas # type: ignore - ) - else: - self.num_samples = math.ceil( - len(self.dataset) / self.num_replicas) # type: ignore - self.total_size = self.num_samples * self.num_replicas - self.shuffle = shuffle - self.seed = seed - self.limited = limited - if self.limited > -1: - self.num_samples = min(self.limited, self.num_samples) - - def __iter__(self): - if self.shuffle: - # deterministically shuffle based on epoch and seed - g = torch.Generator() - g.manual_seed(self.seed + self.epoch) - indices = torch.randperm(len(self.dataset), - generator=g).tolist() # type: ignore - else: - indices = list(range(len(self.dataset))) # type: ignore - - if not self.drop_last: - # add extra samples to make it evenly divisible - padding_size = self.total_size - len(indices) - if padding_size <= len(indices): - indices += indices[:padding_size] - else: - indices += ( - indices * - math.ceil(padding_size / len(indices)))[:padding_size] - else: - # remove tail of data to make it evenly divisible.
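(Aside: a small worked example, with made-up numbers, of the pad-vs-drop arithmetic this sampler uses so that every rank receives exactly `num_samples` indices.)

```python
import math

# Made-up numbers: 10 samples, 4 replicas, no drop_last.
dataset_len, num_replicas = 10, 4
num_samples = math.ceil(dataset_len / num_replicas)   # 3 indices per rank
total_size = num_samples * num_replicas               # 12 in total
indices = list(range(dataset_len))
indices += indices[:total_size - len(indices)]        # pad by repeating the head: [0..9, 0, 1]
rank = 1
shard = indices[rank:total_size:num_replicas]         # strided shard for this rank -> [1, 5, 9]
assert len(shard) == num_samples
```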
- indices = indices[:self.total_size] - assert len(indices) == self.total_size - - # subsample - indices = indices[self.rank:self.total_size:self.num_replicas] - if self.limited > -1 and len(indices) > self.limited: - print(f'Trim indices: {len(indices)} --> {self.limited}') - indices = indices[:self.limited] - assert len(indices) == self.num_samples - # shuffle subsample - if self.shuffle: # and self.epoch > 0: - # random.seed(self.seed + self.epoch) - random.seed() # fresh entropy; seeding with datetime.now() is not supported on newer Python - random.shuffle(indices) - - return iter(indices) - - def __len__(self) -> int: - return self.num_samples - - def set_epoch(self, epoch: int) -> None: - self.epoch = 0 # keep data unchanged - # self.epoch = epoch - - -class NodeSplitSampler(Sampler): - - def __init__(self, - dataset, - shuffle, - random_seed, - first_epoch_skip_shuffle=False, - prepare_data=True): - self.dataset = dataset - self.shuffle = shuffle - self.random_seed = random_seed - - self.world_size = get_world_size() - self.local_size = get_local_size() - self.node_size = self.world_size // self.local_size - self.rank = get_rank() - self.node_idx = self.rank // self.local_size - self.local_rank = get_local_rank() - self.next_epoch_skip_shuffle = first_epoch_skip_shuffle - - # only be used when shuffle = True and first_epoch_skip_shuffle = True - self.prepare_data = prepare_data - self.prepare = None - self.skip = 0 - - def get_index_on_node(self): - # there is no need to cache source_list as we only call this function - # once in the whole training life-time - source_list = self.dataset.get_composite_source_idx() - idx_split = list(enumerate(source_list)) - idx_split = torch.tensor(idx_split) - if self.shuffle: - random_idx = self.get_shuffle_idx(len(idx_split)) - idx_split = idx_split[random_idx] - max_split = idx_split[:, 1].max() + 1 - priority = self.get_shuffle_idx(max_split) - sort_idx = torch.argsort(priority[idx_split[:, 1]]) - idx_split = idx_split[sort_idx] - num_idx_on_node = (len(idx_split) + self.node_size - - 1) // self.node_size - offset = num_idx_on_node * self.node_idx - offset_end = offset + num_idx_on_node - offset_end = min(offset_end, len(idx_split)) - unique_split_index = list(set(idx_split[offset:offset_end, - 1].tolist())) - logger.info(unique_split_index) - if self.shuffle and self.next_epoch_skip_shuffle and self.prepare_data: - if get_local_rank() == 0: - self.prepare = PrepareData(self.dataset, - prepare_t_versions=[], - fixed_samples_in_node=True) - for s in unique_split_index: - self.prepare.prepare(s) - return idx_split[offset:offset_end, 0] - - def get_shuffle_idx(self, n): - g = torch.Generator() - g.manual_seed(self.random_seed) - random_idx = torch.randperm(n, generator=g) - self.random_seed += 99 - return random_idx - - def get_index_on_rank(self, idx_on_node): - if self.shuffle: - if not self.next_epoch_skip_shuffle: - curr_idx_on_node = idx_on_node[self.get_shuffle_idx( - len(idx_on_node))] - else: - curr_idx_on_node = idx_on_node - self.next_epoch_skip_shuffle = False - else: - curr_idx_on_node = idx_on_node - idx_rank_size = (len(curr_idx_on_node) + self.local_size - - 1) // self.local_size - offset = idx_rank_size * self.local_rank - offset_end = offset + idx_rank_size - offset_end = min(offset_end, len(curr_idx_on_node)) - curr_idx_on_node = curr_idx_on_node.tolist() - for i in range(offset, offset_end): - yield curr_idx_on_node[i] - - def set_skip(self, num): - # renamed from `skip`: a method named `skip` clashed with the - # `self.skip` counter set in __init__ and was uncallable - self.skip = num - - def __iter__(self): - self.curr_idx = 0 - idx_on_node = self.get_index_on_node() - if self.skip > 0: - logging.info('we will skip 
{}'.format(self.skip)) - while True: - for i in self.get_index_on_rank(idx_on_node): - if self.skip <= 0: - yield i - else: - self.skip -= 1 - - def __len__(self): - raise ValueError('should not be called') diff --git a/AVLFormer/src/datasets/data_utils/image_ops.py b/AVLFormer/src/datasets/data_utils/image_ops.py deleted file mode 100755 index 59471c5..0000000 --- a/AVLFormer/src/datasets/data_utils/image_ops.py +++ /dev/null @@ -1,248 +0,0 @@ -import base64 - -import cv2 -import numpy as np -import scipy.misc -import torch - - -def img_from_base64(imagestring): - try: - jpgbytestring = base64.b64decode(imagestring) - nparr = np.frombuffer(jpgbytestring, np.uint8) - r = cv2.imdecode(nparr, cv2.IMREAD_COLOR) - return r - except ValueError: - return None - - -def myimrotate(img, - angle, - center=None, - scale=1.0, - border_value=0, - auto_bound=False): - """Rotate an image. - Args: - img (ndarray): Image to be rotated. - angle (float): Rotation angle in degrees, positive values mean - clockwise rotation. - center (tuple): Center of the rotation in the source image, by default - it is the center of the image. - scale (float): Isotropic scale factor. - border_value (int): Border value. - auto_bound (bool): Whether to adjust the image size to cover the whole - rotated image. - Returns: - ndarray: The rotated image. - """ - if center is not None and auto_bound: - raise ValueError('`auto_bound` conflicts with `center`') - h, w = img.shape[:2] - if center is None: - center = ((w - 1) * 0.5, (h - 1) * 0.5) - assert isinstance(center, tuple) - - matrix = cv2.getRotationMatrix2D(center, angle, scale) - if auto_bound: - cos = np.abs(matrix[0, 0]) - sin = np.abs(matrix[0, 1]) - new_w = h * sin + w * cos - new_h = h * cos + w * sin - matrix[0, 2] += (new_w - w) * 0.5 - matrix[1, 2] += (new_h - h) * 0.5 - w = int(np.round(new_w)) - h = int(np.round(new_h)) - rotated = cv2.warpAffine(img, matrix, (w, h), borderValue=border_value) - return rotated - - -def myimresize(img, size, return_scale=False, interpolation='bilinear'): - """Resize image to a given size. - Args: - img (ndarray): The input image. - size (tuple): Target (w, h). - return_scale (bool): Whether to return `w_scale` and `h_scale`. - interpolation (str): Interpolation method, accepted values are - "nearest", "bilinear", "bicubic", "area", "lanczos". - Returns: - tuple or ndarray: (`resized_img`, `w_scale`, `h_scale`) or - `resized_img`. 
- """ - h, w = img.shape[:2] - resized_img = cv2.resize(img, (size[0], size[1]), - interpolation=cv2.INTER_LINEAR) - if not return_scale: - return resized_img - else: - w_scale = size[0] / w - h_scale = size[1] / h - return resized_img, w_scale, h_scale - - -def get_transform(center, scale, res, rot=0): - """Generate transformation matrix.""" - h = 200 * scale - t = np.zeros((3, 3)) - t[0, 0] = float(res[1]) / h - t[1, 1] = float(res[0]) / h - t[0, 2] = res[1] * (-float(center[0]) / h + .5) - t[1, 2] = res[0] * (-float(center[1]) / h + .5) - t[2, 2] = 1 - if not rot == 0: - rot = -rot # To match direction of rotation from cropping - rot_mat = np.zeros((3, 3)) - rot_rad = rot * np.pi / 180 - sn, cs = np.sin(rot_rad), np.cos(rot_rad) - rot_mat[0, :2] = [cs, -sn] - rot_mat[1, :2] = [sn, cs] - rot_mat[2, 2] = 1 - # Need to rotate around center - t_mat = np.eye(3) - t_mat[0, 2] = -res[1] / 2 - t_mat[1, 2] = -res[0] / 2 - t_inv = t_mat.copy() - t_inv[:2, 2] *= -1 - t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t))) - return t - - -def transform(pt, center, scale, res, invert=0, rot=0): - """Transform pixel location to different reference.""" - t = get_transform(center, scale, res, rot=rot) - if invert: - # t = np.linalg.inv(t) - t_torch = torch.from_numpy(t) - t_torch = torch.inverse(t_torch) - t = t_torch.numpy() - new_pt = np.array([pt[0] - 1, pt[1] - 1, 1.]).T - new_pt = np.dot(t, new_pt) - return new_pt[:2].astype(int) + 1 - - -def crop(img, center, scale, res, rot=0): - """Crop image according to the supplied bounding box.""" - # Upper left point - ul = np.array(transform([1, 1], center, scale, res, invert=1)) - 1 - # Bottom right point - br = np.array( - transform([res[0] + 1, res[1] + 1], center, scale, res, invert=1)) - 1 - # Padding so that when rotated proper amount of context is included - pad = int(np.linalg.norm(br - ul) / 2 - float(br[1] - ul[1]) / 2) - print('pad:', pad, 'rot:', rot, 'ul:', ul, "br:", br) - if not rot == 0: - ul -= pad - br += pad - new_shape = [br[1] - ul[1], br[0] - ul[0]] - if len(img.shape) > 2: - new_shape += [img.shape[2]] - new_img = np.zeros(new_shape) - print('new_shape:', new_shape, ' old_shape:', img.shape) - - # Range to fill new array - new_x = max(0, -ul[0]), min(br[0], len(img[0])) - ul[0] - new_y = max(0, -ul[1]), min(br[1], len(img)) - ul[1] - # Range to sample from original image - old_x = max(0, ul[0]), min(len(img[0]), br[0]) - old_y = max(0, ul[1]), min(len(img), br[1]) - - new_img[new_y[0]:new_y[1], new_x[0]:new_x[1]] = img[old_y[0]:old_y[1], - old_x[0]:old_x[1]] - if not rot == 0: - # Remove padding - # new_img = scipy.misc.imrotate(new_img, rot) - new_img = myimrotate(new_img, rot) - new_img = new_img[pad:-pad, pad:-pad] - - # new_img = scipy.misc.imresize(new_img, res) - new_img = myimresize(new_img, [res[0], res[1]]) - return new_img - - -def uncrop(img, center, scale, orig_shape, rot=0, is_rgb=True): - """'Undo' the image cropping/resizing. - This function is used when evaluating mask/part segmentation. 
- """ - res = img.shape[:2] - # Upper left point - ul = np.array(transform([1, 1], center, scale, res, invert=1)) - 1 - # Bottom right point - br = np.array( - transform([res[0] + 1, res[1] + 1], center, scale, res, invert=1)) - 1 - # size of cropped image - crop_shape = [br[1] - ul[1], br[0] - ul[0]] - - new_shape = [br[1] - ul[1], br[0] - ul[0]] - if len(img.shape) > 2: - new_shape += [img.shape[2]] - new_img = np.zeros(orig_shape, dtype=np.uint8) - # Range to fill new array - new_x = max(0, -ul[0]), min(br[0], orig_shape[1]) - ul[0] - new_y = max(0, -ul[1]), min(br[1], orig_shape[0]) - ul[1] - # Range to sample from original image - old_x = max(0, ul[0]), min(orig_shape[1], br[0]) - old_y = max(0, ul[1]), min(orig_shape[0], br[1]) - # img = scipy.misc.imresize(img, crop_shape, interp='nearest') - img = myimresize(img, [crop_shape[0], crop_shape[1]]) - new_img[old_y[0]:old_y[1], old_x[0]:old_x[1]] = img[new_y[0]:new_y[1], - new_x[0]:new_x[1]] - return new_img - - -def rot_aa(aa, rot): - """Rotate axis angle parameters.""" - # pose parameters - R = np.array([[np.cos(np.deg2rad(-rot)), -np.sin(np.deg2rad(-rot)), 0], - [np.sin(np.deg2rad(-rot)), - np.cos(np.deg2rad(-rot)), 0], [0, 0, 1]]) - # find the rotation of the body in camera frame - per_rdg, _ = cv2.Rodrigues(aa) - # apply the global rotation to the global orientation - resrot, _ = cv2.Rodrigues(np.dot(R, per_rdg)) - aa = (resrot.T)[0] - return aa - - -def flip_img(img): - """Flip rgb images or masks. - channels come last, e.g. (256,256,3). - """ - img = np.fliplr(img) - return img - - -def flip_kp(kp): - """Flip keypoints.""" - flipped_parts = [ - 5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13, 14, 15, 16, 17, 18, 19, - 21, 20, 23, 22 - ] - kp = kp[flipped_parts] - kp[:, 0] = -kp[:, 0] - return kp - - -def flip_pose(pose): - """Flip pose. - The flipping is based on SMPL parameters. - """ - flippedParts = [ - 0, 1, 2, 6, 7, 8, 3, 4, 5, 9, 10, 11, 15, 16, 17, 12, 13, 14, 18, 19, - 20, 24, 25, 26, 21, 22, 23, 27, 28, 29, 33, 34, 35, 30, 31, 32, 36, 37, - 38, 42, 43, 44, 39, 40, 41, 45, 46, 47, 51, 52, 53, 48, 49, 50, 57, 58, - 59, 54, 55, 56, 63, 64, 65, 60, 61, 62, 69, 70, 71, 66, 67, 68 - ] - pose = pose[flippedParts] - # we also negate the second and the third dimension of the axis-angle - pose[1::3] = -pose[1::3] - pose[2::3] = -pose[2::3] - return pose - - -def flip_aa(aa): - """Flip axis-angle representation. - We negate the second and the third dimension of the axis-angle. - """ - aa[1] = -aa[1] - aa[2] = -aa[2] - return aa \ No newline at end of file diff --git a/AVLFormer/src/datasets/data_utils/video_decoder.py b/AVLFormer/src/datasets/data_utils/video_decoder.py deleted file mode 100644 index c0c29f6..0000000 --- a/AVLFormer/src/datasets/data_utils/video_decoder.py +++ /dev/null @@ -1,359 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -# reference https://github.com/facebookresearch/SlowFast/blob/master/slowfast/datasets/decoder.py - -import math -import random - -import numpy as np -import torch - - -def temporal_sampling(frames, start_idx, end_idx, num_samples): - """ - Given the start and end frame index, sample num_samples frames between - the start and end with equal interval. - Args: - frames (list(av.video.frame.VideoFrame)): a list of decoded video frames - start_idx (int): the index of the start frame. - end_idx (int): the index of the end frame. - num_samples (int): number of frames to sample. 
- Returns: - frames (tensor): a tensor of temporally sampled video frames, dimension is - `num clip frames` x `channel` x `height` x `width`. - """ - index = torch.linspace(start_idx, end_idx, num_samples) - index = torch.clamp(index, 0, len(frames) - 1).long().tolist() - frames = [frames[idx] for idx in index] - return frames - - # seq = np.arange(0,len(frames)).tolist() - # print('seq_len:', len(frames), 'num_samples:', num_samples) - # new_index = random.sample(seq,num_samples) - # new_index.sort() - # frames = [frames[idx] for idx in new_index] - - # if len(frames)<=num_samples: - # index = torch.linspace(start_idx, end_idx, num_samples) - # index = torch.clamp(index, 0, len(frames) - 1).long().tolist() - # frames = [frames[idx] for idx in index] - # else: - # seq = np.arange(0,len(frames)).tolist() - # # print('seq_len:', len(frames), 'num_samples:', num_samples) - # new_index = random.sample(seq,num_samples) - # new_index.sort() - # frames = [frames[idx] for idx in new_index] - - # return frames - - -def get_start_end_idx(video_size, clip_size, clip_idx, num_clips): - """ - Sample a clip of size clip_size from a video of size video_size and - return the indices of the first and last frame of the clip. If clip_idx is - -1, the clip is randomly sampled, otherwise uniformly split the video to - num_clips clips, and select the start and end index of clip_idx-th video - clip. - Args: - video_size (int): number of overall frames. - clip_size (int): size of the clip to sample from the frames. - i.e., #frames to get at the original frame rate. - clip_idx (int): if clip_idx is -1, perform random jitter sampling. If - clip_idx is larger than -1, uniformly split the video to num_clips - clips, and select the start and end index of the clip_idx-th video - clip. - num_clips (int): overall number of clips to uniformly sample from the - given video for testing. - Returns: - start_idx (int): the start frame index. - end_idx (int): the end frame index. - """ - delta = max(video_size - clip_size, 0) - if clip_idx == -1: - # Random temporal sampling. - start_idx = random.uniform(0, delta) - else: - # Uniformly sample the clip with the given index. - start_idx = delta * clip_idx / num_clips - end_idx = start_idx + clip_size - 1 - return start_idx, end_idx - - # delta = max(video_size - clip_size, 0) - # if clip_idx == -1: - # # Random temporal sampling. - # start_idx = random.uniform(0, delta) - # else: - # # Uniformly sample the clip with the given index. - # start_idx = delta * clip_idx / num_clips - # if clip_idx == -1 and clip_size>int(video_size*0.7): - # end_idx = start_idx + random.uniform(0, clip_size-start_idx) - 1 - # else: - # end_idx = start_idx + clip_size - 1 - # return start_idx, end_idx - - -def pyav_decode_stream(container, - start_pts, - end_pts, - stream, - stream_name, - buffer_size=0): - """ - Decode the video with PyAV decoder. - Args: - container (container): PyAV container. - start_pts (int): the starting Presentation TimeStamp to fetch the - video frames. - end_pts (int): the ending Presentation TimeStamp of the decoded frames. - stream (stream): PyAV stream. - stream_name (dict): a dictionary of streams. For example, {"video": 0} - means video stream at stream index 0. - buffer_size (int): number of additional frames to decode beyond end_pts. - Returns: - result (list): list of frames decoded. - max_pts (int): max Presentation TimeStamp of the video sequence. - """ - # Seeking in the stream is imprecise. Thus, seek to an earlier PTS by a - # margin pts.
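(Aside: a tiny runnable sketch of the linspace-and-clamp sampling that `temporal_sampling` above performs; the frame list and index values are illustrative only.)

```python
import torch

# Dummy "frames": just ids, to show the even spacing temporal_sampling produces.
frames = list(range(100))
start_idx, end_idx, num_samples = 10.0, 49.0, 8
index = torch.linspace(start_idx, end_idx, num_samples)
index = torch.clamp(index, 0, len(frames) - 1).long().tolist()
sampled = [frames[i] for i in index]
print(sampled)  # [10, 15, 21, 26, 32, 37, 43, 49]
```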
- margin = 1024 - seek_offset = max(start_pts - margin, 0) - - container.seek(seek_offset, any_frame=False, backward=True, stream=stream) - frames = {} - buffer_count = 0 - max_pts = 0 - for frame in container.decode(**stream_name): - max_pts = max(max_pts, frame.pts) - if frame.pts < start_pts: - continue - if frame.pts <= end_pts: - frames[frame.pts] = frame - else: - buffer_count += 1 - frames[frame.pts] = frame - if buffer_count >= buffer_size: - break - result = [frames[pts] for pts in sorted(frames)] - return result, max_pts - - -def pyav_decode(container, - sampling_rate, - num_frames, - clip_idx, - num_clips=10, - target_fps=30, - safeguard_duration=False, - video_max_pts=None, - start=None, - end=None): - """ - Convert the video from its original fps to the target_fps. If the video - supports selective decoding (i.e., the video header contains decoding - information), perform temporal selective decoding and sample a clip from - the video with the PyAV decoder. If the video does not support selective - decoding, decode the entire video. - - Args: - container (container): pyav container. - sampling_rate (int): frame sampling rate (interval between two sampled - frames). - num_frames (int): number of frames to sample. - clip_idx (int): if clip_idx is -1, perform random temporal sampling. If - clip_idx is larger than -1, uniformly split the video to num_clips - clips, and select the clip_idx-th video clip. - If clip_idx is -2, uniformly sample `num_frames` from the whole video - specified by `container`, ignore all the other args (e.g., - sampling_rate, target_fps, etc.). - num_clips (int): overall number of clips to uniformly sample from the - given video. - target_fps (int): the input video may have a different fps, convert it to - the target video fps before frame sampling. - Returns: - frames (tensor): decoded frames from the video. Return None if no - video stream was found. - fps (float): the number of frames per second of the video. - decode_all_video (bool): If True, the entire video was decoded. - """ - # Try to fetch the decoding information from the video header. Some - # videos do not support this, in which case the duration is None. - fps = float(container.streams.video[0].average_rate) - frames_length = container.streams.video[0].frames - duration = container.streams.video[0].duration - if duration == None: - # If failed to fetch the decoding information, decode the entire video. - decode_all_video = True - video_start_pts, video_end_pts = 0, math.inf - video_max_pts = None - else: - if container.streams.video and safeguard_duration: - if video_max_pts: - # reuse if possible, to improve efficiency - duration = video_max_pts - else: - # decode the whole video to get the last frame pts - _, max_pts = pyav_decode_stream( - container, - 0, - math.inf, - container.streams.video[0], - {"video": 0}, - ) - if max_pts < 0.8 * duration: - print( - f"max_frame_pts and duration mismatch:{max_pts} vs. {duration}" - ) - duration = max_pts - video_max_pts = duration - # Perform selective decoding.
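(Aside: illustrative arithmetic for the selective-decoding path that follows; all numbers are made up, and `timebase` approximates PTS units per frame the same way the code does.)

```python
# Made-up numbers for one uniformly chosen clip (clip 2 of 10).
fps, target_fps = 30.0, 3              # source fps vs. desired sampling fps
sampling_rate, num_frames = 1, 32
frames_length, duration = 900, 450000  # frame count / stream duration in PTS units

clip_size = sampling_rate * num_frames / target_fps * fps  # 320 source frames
delta = max(frames_length - clip_size, 0)                  # 580 frames of slack
start_idx = delta * 2 / 10                                 # uniform split, clip_idx=2
end_idx = start_idx + clip_size - 1
timebase = duration / frames_length                        # PTS units per frame
video_start_pts = int(start_idx * timebase)                # 58000
video_end_pts = int(end_idx * timebase)                    # 217500
```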
- decode_all_video = False - clip_size = sampling_rate * num_frames / target_fps * fps - sample_clip_idx = clip_idx - sample_num_clips = num_clips - if clip_idx == -2: - # the sampled clip will be the entire video - clip_size = frames_length - sample_clip_idx = 0 - sample_num_clips = 1 - start_idx, end_idx = get_start_end_idx( - frames_length, - clip_size, - sample_clip_idx, - sample_num_clips, - ) - timebase = duration / frames_length - video_start_pts = int(start_idx * timebase) - video_end_pts = int(end_idx * timebase) - - frames = None - if start != None and end != None and duration != None: - timebase = duration / frames_length - video_start_pts = int(start * fps * timebase) - video_end_pts = int(end * fps * timebase) - elif start != None and end != None and duration == None and fps != None: - video_start_pts = int(start * fps) - video_end_pts = int(end * fps) - elif start != None and end != None and duration == None: - video_start_pts = int(start * 30) - video_end_pts = int(end * 30) - # frames = None - # if start!=None and end!=None and duration!=None: - # timebase = duration / frames_length - # offset = np.random.uniform(0,end/2-start/2) - # video_start_pts = int((start+offset) * fps * timebase) - # video_end_pts = int((end-offset) * fps * timebase) - # elif start!=None and end!=None and duration==None and fps!=None: - # offset = np.random.uniform(0,end/2-start/2) - # video_start_pts = int((start+offset) * fps) - # video_end_pts = int((end-offset) * fps) - # elif start!=None and end!=None and duration==None: - # offset = np.random.uniform(0,end/2-start/2) - # video_start_pts = int((start+offset) * 30) - # video_end_pts = int((end-offset) * 30) - # print('video_start_pts:', video_start_pts) - # print('video_end_pts:', video_end_pts) - # If video stream was found, fetch video frames from the video. - if container.streams.video: - video_frames, max_pts = pyav_decode_stream( - container, - video_start_pts, - video_end_pts, - container.streams.video[0], - {"video": 0}, - ) - - frames = video_frames - # move to after frame sampling - # frames = [frame.to_rgb().to_ndarray() for frame in video_frames] - # frames = torch.as_tensor(np.stack(frames)) - return frames, fps, decode_all_video, video_max_pts - - -def decode( - container, - sampling_rate, - num_frames, - clip_idx=-1, - num_clips=10, - video_meta=None, - target_fps=30, - backend="pyav", - max_spatial_scale=0, - safeguard_duration=False, - video_max_pts=None, - start=None, - end=None, -): - """ - Decode the video and perform temporal sampling. - Args: - container (container): pyav container. - sampling_rate (int): frame sampling rate (interval between two sampled - frames). - num_frames (int): number of frames to sample. - clip_idx (int): if clip_idx is -1, perform random temporal - sampling. If clip_idx is larger than -1, uniformly split the - video to num_clips clips, and select the - clip_idx-th video clip. - num_clips (int): overall number of clips to uniformly - sample from the given video. - video_meta (dict): a dict contains VideoMetaData. Details can be find - at `pytorch/vision/torchvision/io/_video_opt.py`. - target_fps (int): the input video may have different fps, convert it to - the target video fps before frame sampling. - backend (str): decoding backend includes `pyav` and `torchvision`. The - default one is `pyav`. - max_spatial_scale (int): keep the aspect ratio and resize the frame so - that shorter edge size is max_spatial_scale. Only used in - `torchvision` backend. 
- Returns: - frames (tensor): decoded frames from the video. - """ - # Currently support two decoders: 1) PyAV, and 2) TorchVision. - assert clip_idx >= -2, "Not valid clip_idx {}".format(clip_idx) - try: - if backend == "pyav": - frames, fps, decode_all_video, video_max_pts = pyav_decode( - container, - sampling_rate, - num_frames, - clip_idx, - num_clips, - target_fps, - safeguard_duration=safeguard_duration, - video_max_pts=video_max_pts, - start=start, - end=end) - else: - raise NotImplementedError( - "Unknown decoding backend {}".format(backend)) - except Exception as e: - print("Failed to decode by {} with exception: {}".format(backend, e)) - print("Failed to decode the video: {}".format(container.name)) - container.close() - return None, video_max_pts - - # Return None if the frames were not decoded successfully. - if frames is None or len(frames) == 0: - container.close() - return None, video_max_pts - clip_size = sampling_rate * num_frames / target_fps * fps - sample_clip_idx = clip_idx - sample_num_clips = num_clips - if clip_idx == -2: - clip_size = len(frames) - sample_clip_idx = 0 - sample_num_clips = 1 - - start_idx, end_idx = get_start_end_idx( - len(frames), - clip_size, - sample_clip_idx if decode_all_video else 0, - sample_num_clips if decode_all_video else 1, - ) - # Perform temporal sampling from the decoded video. - frames = temporal_sampling(frames, start_idx, end_idx, num_frames) - frames = [frame.to_rgb().to_ndarray() for frame in frames] - frames = torch.as_tensor(np.stack(frames)) - return frames, video_max_pts \ No newline at end of file diff --git a/AVLFormer/src/datasets/data_utils/video_functional.py b/AVLFormer/src/datasets/data_utils/video_functional.py deleted file mode 100644 index 0ee9668..0000000 --- a/AVLFormer/src/datasets/data_utils/video_functional.py +++ /dev/null @@ -1,119 +0,0 @@ -import numbers - -import PIL -from PIL import Image -import cv2 -import numpy as np -import torch - - -def _is_tensor_clip(clip): - return torch.is_tensor(clip) and clip.ndimension() == 4 - - -def crop_clip(clip, min_h, min_w, h, w): - if isinstance(clip[0], np.ndarray): - cropped = [img[min_h:min_h + h, min_w:min_w + w, :] for img in clip] - - elif isinstance(clip[0], PIL.Image.Image): - cropped = [ - img.crop((min_w, min_h, min_w + w, min_h + h)) for img in clip - ] - else: - raise TypeError('Expected numpy.ndarray or PIL.Image' + - ' but got list of {0}'.format(type(clip[0]))) - return cropped - - -def to_grayscale(img, num_output_channels=1): - """Convert image to grayscale version of image. - - Args: - img (PIL Image): Image to be converted to grayscale. - - Returns: - PIL Image: Grayscale version of the image. - if num_output_channels = 1 : returned image is single channel - - if num_output_channels = 3 : returned image is 3 channel with r = g = b - """ - if not isinstance(img, PIL.Image.Image): - raise TypeError('img should be PIL Image. 
Got {}'.format(type(img))) - - if num_output_channels == 1: - img = img.convert('L') - elif num_output_channels == 3: - img = img.convert('L') - np_img = np.array(img, dtype=np.uint8) - np_img = np.dstack([np_img, np_img, np_img]) - img = Image.fromarray(np_img, 'RGB') - else: - raise ValueError('num_output_channels should be either 1 or 3') - - return img - - -def resize_clip(clip, size, interpolation='bilinear'): - if isinstance(clip[0], np.ndarray): - if isinstance(size, numbers.Number): - im_h, im_w, im_c = clip[0].shape - # Min spatial dim already matches minimal size - if (im_w <= im_h and im_w == size) or (im_h <= im_w - and im_h == size): - return clip - new_h, new_w = get_resize_sizes(im_h, im_w, size) - size = (new_w, new_h) - else: - size = size[1], size[0] - if interpolation == 'bilinear': - np_inter = cv2.INTER_LINEAR - else: - np_inter = cv2.INTER_NEAREST - scaled = [ - cv2.resize(img, size, interpolation=np_inter) for img in clip - ] - elif isinstance(clip[0], PIL.Image.Image): - if isinstance(size, numbers.Number): - im_w, im_h = clip[0].size - # Min spatial dim already matches minimal size - if (im_w <= im_h and im_w == size) or (im_h <= im_w - and im_h == size): - return clip - new_h, new_w = get_resize_sizes(im_h, im_w, size) - size = (new_w, new_h) - else: - size = size[1], size[0] - if interpolation == 'bilinear': - # fixed: the PIL branch previously mapped 'bilinear' to NEAREST and vice versa - pil_inter = PIL.Image.BILINEAR - else: - pil_inter = PIL.Image.NEAREST - scaled = [img.resize(size, pil_inter) for img in clip] - else: - raise TypeError('Expected numpy.ndarray or PIL.Image' + - ' but got list of {0}'.format(type(clip[0]))) - return scaled - - -def get_resize_sizes(im_h, im_w, size): - if im_w < im_h: - ow = size - oh = int(size * im_h / im_w) - else: - oh = size - ow = int(size * im_w / im_h) - return oh, ow - - -def normalize(clip, mean, std, inplace=False): - if not _is_tensor_clip(clip): - raise TypeError('tensor is not a torch clip.') - - if not inplace: - clip = clip.clone() - - dtype = clip.dtype - mean = torch.as_tensor(mean, dtype=dtype, device=clip.device) - std = torch.as_tensor(std, dtype=dtype, device=clip.device) - clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None]) - - return clip diff --git a/AVLFormer/src/datasets/data_utils/video_ops.py b/AVLFormer/src/datasets/data_utils/video_ops.py deleted file mode 100644 index 300a3b1..0000000 --- a/AVLFormer/src/datasets/data_utils/video_ops.py +++ /dev/null @@ -1,188 +0,0 @@ -import code -import io - -from PIL import Image -import av -import numpy as np -from src.datasets.data_utils import video_decoder as decoder -import torch - - -def get_video_decoding_kwargs(container, - num_frames, - target_fps, - num_clips=None, - clip_idx=None, - sampling_strategy="rand", - safeguard_duration=False, - video_max_pts=None, - start=None, - end=None): - if num_clips is None: - three_clip_names = ["start", "middle", "end"] # uniformly 3 clips - assert sampling_strategy in ["rand", "uniform"] + three_clip_names - if sampling_strategy == "rand": - decoder_kwargs = dict( - container=container, - sampling_rate=1, - num_frames=num_frames, - clip_idx=-1, # random sampling - num_clips=None, # will not be used when clip_idx is `-1` - target_fps=target_fps, - start=start, - end=end) - elif sampling_strategy == "uniform": - decoder_kwargs = dict( - container=container, - sampling_rate=1, # will not be used when clip_idx is `-2` - num_frames=num_frames, - clip_idx=-2, # uniformly sampling from the whole video - num_clips=1, # will not be used when clip_idx is `-2` - target_fps=target_fps, # 
will not be used when clip_idx is `-2` - start=start, - end=end) - else: # in three_clip_names - decoder_kwargs = dict( - container=container, - sampling_rate=1, - num_frames=num_frames, - clip_idx=three_clip_names.index(sampling_strategy), - num_clips=3, - target_fps=target_fps, - start=start, - end=end) - else: # multi_clip_ensemble, num_clips and clip_idx are only used here - assert clip_idx is not None - # sampling_strategy will not be used, as uniform sampling will be used by default. - # uniformly sample `num_clips` from the video, - # each clip sample num_frames frames at target_fps. - decoder_kwargs = dict(container=container, - sampling_rate=1, - num_frames=num_frames, - clip_idx=clip_idx, - num_clips=num_clips, - target_fps=target_fps, - safeguard_duration=safeguard_duration, - video_max_pts=video_max_pts, - start=start, - end=end) - return decoder_kwargs - - -def extract_frames_from_video_path(video_path, - target_fps=3, - num_frames=3, - multi_thread_decode=False, - sampling_strategy="rand", - safeguard_duration=False, - start=None, - end=None): - in_mem_bytes_io = video_path - try: - frames, video_max_pts = extract_frames_from_video_binary( - in_mem_bytes_io, - target_fps=target_fps, - num_frames=num_frames, - multi_thread_decode=multi_thread_decode, - sampling_strategy=sampling_strategy, - safeguard_duration=safeguard_duration, - start=start, - end=end) - except Exception as e: - print(f"Error processing video {video_path}, {e}") - return None, None - return frames, video_max_pts - - -def extract_frames_from_video_binary(in_mem_bytes_io, - target_fps=3, - num_frames=3, - num_clips=None, - clip_idx=None, - multi_thread_decode=False, - sampling_strategy="rand", - safeguard_duration=False, - video_max_pts=None, - start=None, - end=None): - """ - Args: - in_mem_bytes_io: binary from read file object - >>> with open(video_path, "rb") as f: - >>> input_bytes = f.read() - >>> frames = extract_frames_from_video_binary(input_bytes) - OR from saved binary in lmdb database - >>> env = lmdb.open("lmdb_dir", readonly=True) - >>> txn = env.begin() - >>> stream = io.BytesIO(txn.get(str("key").encode("utf-8"))) - >>> frames = extract_frames_from_video_binary(stream) - >>> from torchvision.utils import save_image - >>> save_image(frames[0], "path/to/example.jpg") # save the extracted frames. - target_fps: int, the input video may have different fps, convert it to - the target video fps before frame sampling. - num_frames: int, number of frames to sample. - multi_thread_decode: bool, if True, perform multi-thread decoding. - sampling_strategy: str, how to sample frame from video, one of - ["rand", "uniform", "start", "middle", "end"] - `rand`: randomly sample consecutive num_frames from the video at target_fps - Note it randomly samples a clip containing num_frames at target_fps, - not uniformly sample from the whole video - `uniform`: uniformly sample num_frames of equal distance from the video, without - considering target_fps/sampling_rate, etc. E.g., when sampling_strategy=uniform - and num_frames=3, it samples 3 frames at [0, N/2-1, N-1] given a video w/ N frames. - However, note that when num_frames=1, it will sample 1 frame at [0]. - Also note that `target_fps` will not be used under `uniform` sampling strategy. - `start`/`middle`/`end`: first uniformly segment the video into 3 clips, then sample - num_frames from the corresponding clip at target_fps. E.g., num_frames=3, a video - w/ 30 frames, it samples [0, 1, 2]; [9, 10, 11]; [18, 19, 20] for start/middle/end. 
- If the total #frames at target_fps in the video/clip is less than num_frames, - there will be some duplicated frames - num_clips: int, - clip_idx: int - safeguard_duration: - video_max_pts: reuse it to improve efficiency - Returns: - torch.uint8, (T, C, H, W) - """ - try: - # Add `metadata_errors="ignore"` to ignore metadata decoding error. - # When verified visually, it does not seem to affect the extracted frames. - video_container = av.open(in_mem_bytes_io, metadata_errors="ignore") - except Exception as e: - print( - f"extract_frames_from_video_binary(), Exception in loading video binary: {e}" - ) - return None, None - - if multi_thread_decode: - # Enable multiple threads for decoding. - video_container.streams.video[0].thread_type = "AUTO" - try: - # (T, H, W, C), channels are RGB - # see docs in decoder.decode for usage of these parameters. - decoder_kwargs = get_video_decoding_kwargs( - container=video_container, - num_frames=num_frames, - target_fps=target_fps, - num_clips=num_clips, - clip_idx=clip_idx, - sampling_strategy=sampling_strategy, - safeguard_duration=safeguard_duration, - video_max_pts=video_max_pts, - start=start, - end=end) - frames, video_max_pts = decoder.decode(**decoder_kwargs) - except Exception as e: - print( - f"extract_frames_from_video_binary(), Exception in decoding video: {e}" - ) - return None, None - - # For some reason in PyAV, the video container may not auto-close, and it could occupy computational resources - # check more details at https://pyav.org/docs/stable/overview/caveats.html#garbage-collection - video_container.close() - - # (T, H, W, C) -> (T, C, H, W) - if frames is not None: - frames = frames.permute(0, 3, 1, 2) - return frames, video_max_pts \ No newline at end of file diff --git a/AVLFormer/src/datasets/data_utils/video_transforms.py b/AVLFormer/src/datasets/data_utils/video_transforms.py deleted file mode 100644 index 346ee74..0000000 --- a/AVLFormer/src/datasets/data_utils/video_transforms.py +++ /dev/null @@ -1,555 +0,0 @@ -import math -import numbers -import random -import warnings - -import PIL -import numpy as np -import skimage.transform -import torch -import torchvision - -from . import video_functional as F - - -class Compose(object): - """Composes several transforms - - Args: - transforms (list of ``Transform`` objects): list of transforms - to compose - """ - - def __init__(self, transforms): - self.transforms = transforms - - def __call__(self, clip): - for t in self.transforms: - clip = t(clip) - return clip - - -class RandomHorizontalFlip(object): - """Horizontally flip the list of given images randomly with a given probability. - - Args: - p (float): probability of the image being flipped. Default value is 0.5 - """ - - def __init__(self, p=0.5): - self.p = p - - def __call__(self, clip): - """ - Args: - img (PIL.Image or numpy.ndarray): List of images to be cropped - in format (h, w, c) in numpy.ndarray - - Returns: - PIL.Image or numpy.ndarray: Randomly flipped clip - """ - if random.random() < self.p: - if isinstance(clip[0], np.ndarray): - return [np.fliplr(img) for img in clip] - elif isinstance(clip[0], PIL.Image.Image): - return [ - img.transpose(PIL.Image.FLIP_LEFT_RIGHT) for img in clip - ] - else: - raise TypeError('Expected numpy.ndarray or PIL.Image' + - ' but got list of {0}'.format(type(clip[0]))) - return clip - - def __repr__(self): - return self.__class__.__name__ + '(p={})'.format(self.p) - - -class RandomVerticalFlip(object): - """Vertically flip the list of given images randomly with a given probability.
- - Args: - p (float): probability of the image being flipped. Default value is 0.5 - """ - - def __init__(self, p=0.5): - self.p = p - - def __call__(self, clip): - """ - - Args: - img (PIL.Image or numpy.ndarray): List of images to be flipped - in format (h, w, c) in numpy.ndarray - - Returns: - PIL.Image or numpy.ndarray: Randomly flipped clip - """ - if random.random() < self.p: - if isinstance(clip[0], np.ndarray): - return [np.flipud(img) for img in clip] - elif isinstance(clip[0], PIL.Image.Image): - return [ - img.transpose(PIL.Image.FLIP_TOP_BOTTOM) for img in clip - ] - else: - raise TypeError('Expected numpy.ndarray or PIL.Image' + - ' but got list of {0}'.format(type(clip[0]))) - return clip - - def __repr__(self): - return self.__class__.__name__ + '(p={})'.format(self.p) - - -class RandomGrayscale(object): - """Randomly convert image to grayscale with a probability of p (default 0.1). - The image can be a PIL Image or a Tensor, in which case it is expected - to have [..., 3, H, W] shape, where ... means an arbitrary number of leading - dimensions - Args: - p (float): probability that image should be converted to grayscale. - Returns: - PIL Image or Tensor: Grayscale version of the input image with probability p and unchanged - with probability (1-p). - - If input image is 1 channel: grayscale version is 1 channel - - If input image is 3 channel: grayscale version is 3 channel with r == g == b - """ - - def __init__(self, p=0.1): - super().__init__() - self.p = p - - def __call__(self, clip): - """ - Args: - list of imgs (PIL Image or Tensor): Image to be converted to grayscale. - Returns: - PIL Image or Tensor: Randomly grayscaled image. - """ - num_output_channels = 1 if clip[0].mode == 'L' else 3 - if torch.rand(1) < self.p: - for i in range(len(clip)): - clip[i] = F.to_grayscale(clip[i], num_output_channels) - return clip - - -class RandomResize(object): - """Resizes a list of (H x W x C) numpy.ndarray to the final size - - The larger the original image is, the longer it takes to - interpolate - - Args: - interpolation (str): Can be one of 'nearest', 'bilinear' - defaults to nearest - size (tuple): (width, height) - """ - - def __init__(self, ratio=(3. / 4., 4.
/ 3.), interpolation='nearest'): - self.ratio = ratio - self.interpolation = interpolation - - def __call__(self, clip): - scaling_factor = random.uniform(self.ratio[0], self.ratio[1]) - - if isinstance(clip[0], np.ndarray): - im_h, im_w, im_c = clip[0].shape - elif isinstance(clip[0], PIL.Image.Image): - im_w, im_h = clip[0].size - - new_w = int(im_w * scaling_factor) - new_h = int(im_h * scaling_factor) - new_size = (new_w, new_h) - resized = F.resize_clip(clip, - new_size, - interpolation=self.interpolation) - return resized - - -class Resize(object): - """Resizes a list of (H x W x C) numpy.ndarray to the final size - - The larger the original image is, the longer it takes to - interpolate - - Args: - interpolation (str): Can be one of 'nearest', 'bilinear' - defaults to nearest - size (tuple): (width, height) - """ - - def __init__(self, size, interpolation='nearest'): - self.size = size - self.interpolation = interpolation - - def __call__(self, clip): - resized = F.resize_clip(clip, - self.size, - interpolation=self.interpolation) - return resized - - -class RandomCrop(object): - """Extract random crop at the same location for a list of images - - Args: - size (sequence or int): Desired output size for the - crop in format (h, w) - """ - - def __init__(self, size): - if isinstance(size, numbers.Number): - size = (size, size) - - self.size = size - - def __call__(self, clip): - """ - Args: - img (PIL.Image or numpy.ndarray): List of images to be cropped - in format (h, w, c) in numpy.ndarray - - Returns: - PIL.Image or numpy.ndarray: Cropped list of images - """ - h, w = self.size - if isinstance(clip[0], np.ndarray): - im_h, im_w, im_c = clip[0].shape - elif isinstance(clip[0], PIL.Image.Image): - im_w, im_h = clip[0].size - else: - raise TypeError('Expected numpy.ndarray or PIL.Image' + - ' but got list of {0}'.format(type(clip[0]))) - if w > im_w or h > im_h: - error_msg = ( - 'Initial image size should be larger than ' - 'cropped size but got cropped sizes : ({w}, {h}) while ' - 'initial image is ({im_w}, {im_h})'.format(im_w=im_w, - im_h=im_h, - w=w, - h=h)) - raise ValueError(error_msg) - - x1 = random.randint(0, im_w - w) - y1 = random.randint(0, im_h - h) - cropped = F.crop_clip(clip, y1, x1, h, w) - - return cropped - - -class RandomResizedCrop(object): - """Crop the given list of PIL Images to random size and aspect ratio. - - A crop of random size (default: of 0.08 to 1.0) of the original size and a random - aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop - is finally resized to given size. - This is popularly used to train the Inception networks. - - Args: - size: expected output size of each edge - scale: range of size of the origin size cropped - ratio: range of aspect ratio of the origin aspect ratio cropped - interpolation: Default: PIL.Image.BILINEAR - """ - - def __init__(self, - size, - scale=(0.08, 1.0), - ratio=(3. / 4., 4. / 3.), - interpolation='bilinear'): - if isinstance(size, (tuple, list)): - self.size = size - else: - self.size = (size, size) - if (scale[0] > scale[1]) or (ratio[0] > ratio[1]): - warnings.warn("range should be of kind (min, max)") - - self.interpolation = interpolation - self.scale = scale - self.ratio = ratio - - @staticmethod - def get_params(clip, scale, ratio): - """Get parameters for ``crop`` for a random sized crop. - - Args: - img (list of PIL Image): Image to be cropped.
-
-class RandomResizedCrop(object):
-    """Crop the given list of PIL Images to random size and aspect ratio.
-
-    A crop of random size (default: of 0.08 to 1.0) of the original size and a random
-    aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This
-    crop is finally resized to the given size.
-    This is popularly used to train the Inception networks.
-
-    Args:
-        size: expected output size of each edge
-        scale: range of size of the origin size cropped
-        ratio: range of aspect ratio of the origin aspect ratio cropped
-        interpolation: Default: PIL.Image.BILINEAR
-    """
-
-    def __init__(self,
-                 size,
-                 scale=(0.08, 1.0),
-                 ratio=(3. / 4., 4. / 3.),
-                 interpolation='bilinear'):
-        if isinstance(size, (tuple, list)):
-            self.size = size
-        else:
-            self.size = (size, size)
-        if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
-            warnings.warn("range should be of kind (min, max)")
-
-        self.interpolation = interpolation
-        self.scale = scale
-        self.ratio = ratio
-
-    @staticmethod
-    def get_params(clip, scale, ratio):
-        """Get parameters for ``crop`` for a random sized crop.
-
-        Args:
-            clip (list of PIL Image): images to be cropped.
-            scale (tuple): range of size of the origin size cropped
-            ratio (tuple): range of aspect ratio of the origin aspect ratio cropped
-
-        Returns:
-            tuple: params (i, j, h, w) to be passed to ``crop`` for a random
-            sized crop.
-        """
-        if isinstance(clip[0], np.ndarray):
-            height, width, im_c = clip[0].shape
-        elif isinstance(clip[0], PIL.Image.Image):
-            width, height = clip[0].size
-        else:
-            raise TypeError('Expected numpy.ndarray or PIL.Image' +
-                            ' but got list of {0}'.format(type(clip[0])))
-        area = height * width
-
-        for _ in range(10):
-            target_area = random.uniform(*scale) * area
-            log_ratio = (math.log(ratio[0]), math.log(ratio[1]))
-            aspect_ratio = math.exp(random.uniform(*log_ratio))
-
-            w = int(round(math.sqrt(target_area * aspect_ratio)))
-            h = int(round(math.sqrt(target_area / aspect_ratio)))
-
-            if 0 < w <= width and 0 < h <= height:
-                i = random.randint(0, height - h)
-                j = random.randint(0, width - w)
-                return i, j, h, w
-
-        # Fallback to central crop
-        in_ratio = float(width) / float(height)
-        if (in_ratio < min(ratio)):
-            w = width
-            h = int(round(w / min(ratio)))
-        elif (in_ratio > max(ratio)):
-            h = height
-            w = int(round(h * max(ratio)))
-        else:  # whole image
-            w = width
-            h = height
-        i = (height - h) // 2
-        j = (width - w) // 2
-        return i, j, h, w
-
-    def __call__(self, clip):
-        """
-        Args:
-            clip (list of PIL Image): images to be cropped and resized.
-
-        Returns:
-            list of PIL Image: randomly cropped and resized images.
-        """
-        i, j, h, w = self.get_params(clip, self.scale, self.ratio)
-        # Resize the cropped frames, not the original clip, to the target size.
-        imgs = F.crop_clip(clip, i, j, h, w)
-        return F.resize_clip(imgs, self.size, self.interpolation)
-
-    def __repr__(self):
-        interpolate_str = _pil_interpolation_to_str[self.interpolation]
-        format_string = self.__class__.__name__ + '(size={0}'.format(self.size)
-        format_string += ', scale={0}'.format(
-            tuple(round(s, 4) for s in self.scale))
-        format_string += ', ratio={0}'.format(
-            tuple(round(r, 4) for r in self.ratio))
-        format_string += ', interpolation={0})'.format(interpolate_str)
-        return format_string
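The sampling in get_params is easier to follow when the math is run standalone. A sketch with made-up frame dimensions, showing that the sampled window recovers the target area and aspect ratio up to integer rounding:

import math
import random

height, width = 240, 320
scale, ratio = (0.08, 1.0), (3. / 4., 4. / 3.)
area = height * width

target_area = random.uniform(*scale) * area
# Log-uniform sampling makes a ratio r and its inverse 1/r equally likely.
log_ratio = (math.log(ratio[0]), math.log(ratio[1]))
aspect_ratio = math.exp(random.uniform(*log_ratio))

w = int(round(math.sqrt(target_area * aspect_ratio)))
h = int(round(math.sqrt(target_area / aspect_ratio)))
print(w * h, target_area)   # approximately equal
print(w / h, aspect_ratio)  # approximately equal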
-
-
-class RandomRotation(object):
-    """Rotate entire clip randomly by a random angle within given bounds
-
-    Args:
-        degrees (sequence or int): range of degrees to select from
-        If degrees is a number instead of sequence like (min, max),
-        the range of degrees will be (-degrees, +degrees).
-    """
-
-    def __init__(self, degrees):
-        if isinstance(degrees, numbers.Number):
-            if degrees < 0:
-                raise ValueError('If degrees is a single number, '
-                                 'it must be positive')
-            degrees = (-degrees, degrees)
-        else:
-            if len(degrees) != 2:
-                raise ValueError('If degrees is a sequence, '
-                                 'it must be of len 2.')
-
-        self.degrees = degrees
-
-    def __call__(self, clip):
-        """
-        Args:
-            clip (list of PIL.Image or numpy.ndarray): list of images to be
-            rotated in format (h, w, c) in numpy.ndarray
-
-        Returns:
-            PIL.Image or numpy.ndarray: rotated list of images
-        """
-        angle = random.uniform(self.degrees[0], self.degrees[1])
-        if isinstance(clip[0], np.ndarray):
-            rotated = [skimage.transform.rotate(img, angle) for img in clip]
-        elif isinstance(clip[0], PIL.Image.Image):
-            rotated = [img.rotate(angle) for img in clip]
-        else:
-            raise TypeError('Expected numpy.ndarray or PIL.Image' +
-                            ' but got list of {0}'.format(type(clip[0])))
-
-        return rotated
-
-
-class CenterCrop(object):
-    """Extract center crop at the same location for a list of images
-
-    Args:
-        size (sequence or int): Desired output size for the
-        crop in format (h, w)
-    """
-
-    def __init__(self, size):
-        if isinstance(size, numbers.Number):
-            size = (size, size)
-
-        self.size = size
-
-    def __call__(self, clip):
-        """
-        Args:
-            clip (list of PIL.Image or numpy.ndarray): list of images to be
-            cropped in format (h, w, c) in numpy.ndarray
-
-        Returns:
-            PIL.Image or numpy.ndarray: cropped list of images
-        """
-        h, w = self.size
-        if isinstance(clip[0], np.ndarray):
-            im_h, im_w, im_c = clip[0].shape
-        elif isinstance(clip[0], PIL.Image.Image):
-            im_w, im_h = clip[0].size
-        else:
-            raise TypeError('Expected numpy.ndarray or PIL.Image' +
-                            ' but got list of {0}'.format(type(clip[0])))
-        if w > im_w or h > im_h:
-            error_msg = (
-                'Initial image size should be larger than '
-                'cropped size but got cropped sizes : ({w}, {h}) while '
-                'initial image is ({im_w}, {im_h})'.format(im_w=im_w,
-                                                           im_h=im_h,
-                                                           w=w,
-                                                           h=h))
-            raise ValueError(error_msg)
-
-        x1 = int(round((im_w - w) / 2.))
-        y1 = int(round((im_h - h) / 2.))
-        cropped = F.crop_clip(clip, y1, x1, h, w)
-
-        return cropped
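The center-crop origin is pure arithmetic. A standalone check with a made-up clip, again assuming F.crop_clip slices [y1:y1 + h, x1:x1 + w]:

import numpy as np

clip = [np.zeros((120, 160, 3), np.uint8) for _ in range(4)]
h, w = 112, 112
im_h, im_w = clip[0].shape[:2]

# Same arithmetic as CenterCrop.__call__: the window is centred up to
# the rounding in int(round(...)).
x1 = int(round((im_w - w) / 2.))  # 24
y1 = int(round((im_h - h) / 2.))  # 4
cropped = [img[y1:y1 + h, x1:x1 + w] for img in clip]
assert cropped[0].shape == (112, 112, 3)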
-
-
-class ColorJitter(object):
-    """Randomly change the brightness, contrast, saturation and hue of the clip
-
-    Args:
-        brightness (float): How much to jitter brightness. brightness_factor
-        is chosen uniformly from [max(0, 1 - brightness), 1 + brightness].
-        contrast (float): How much to jitter contrast. contrast_factor
-        is chosen uniformly from [max(0, 1 - contrast), 1 + contrast].
-        saturation (float): How much to jitter saturation. saturation_factor
-        is chosen uniformly from [max(0, 1 - saturation), 1 + saturation].
-        hue (float): How much to jitter hue. hue_factor is chosen uniformly from
-        [-hue, hue]. Should be >= 0 and <= 0.5.
-    """
-
-    def __init__(self, brightness=0, contrast=0, saturation=0, hue=0):
-        self.brightness = brightness
-        self.contrast = contrast
-        self.saturation = saturation
-        self.hue = hue
-
-    def get_params(self, brightness, contrast, saturation, hue):
-        if brightness > 0:
-            brightness_factor = random.uniform(max(0, 1 - brightness),
-                                               1 + brightness)
-        else:
-            brightness_factor = None
-
-        if contrast > 0:
-            contrast_factor = random.uniform(max(0, 1 - contrast),
-                                             1 + contrast)
-        else:
-            contrast_factor = None
-
-        if saturation > 0:
-            saturation_factor = random.uniform(max(0, 1 - saturation),
-                                               1 + saturation)
-        else:
-            saturation_factor = None
-
-        if hue > 0:
-            hue_factor = random.uniform(-hue, hue)
-        else:
-            hue_factor = None
-        return brightness_factor, contrast_factor, saturation_factor, hue_factor
-
-    def __call__(self, clip):
-        """
-        Args:
-            clip (list): list of PIL.Image
-
-        Returns:
-            list of PIL.Image: list of transformed PIL.Image
-        """
-        if isinstance(clip[0], np.ndarray):
-            raise TypeError(
-                'Color jitter not yet implemented for numpy arrays')
-        elif isinstance(clip[0], PIL.Image.Image):
-            brightness, contrast, saturation, hue = self.get_params(
-                self.brightness, self.contrast, self.saturation, self.hue)
-
-            # Create img transform function sequence
-            img_transforms = []
-            if brightness is not None:
-                img_transforms.append(
-                    lambda img: torchvision.transforms.functional.
-                    adjust_brightness(img, brightness))
-            if saturation is not None:
-                img_transforms.append(
-                    lambda img: torchvision.transforms.functional.
-                    adjust_saturation(img, saturation))
-            if hue is not None:
-                img_transforms.append(lambda img: torchvision.transforms.
-                                      functional.adjust_hue(img, hue))
-            if contrast is not None:
-                img_transforms.append(
-                    lambda img: torchvision.transforms.functional.
-                    adjust_contrast(img, contrast))
-            random.shuffle(img_transforms)
-
-            # Apply every jitter function to each frame, chaining the outputs
-            # so that all selected adjustments take effect
-            jittered_clip = []
-            for img in clip:
-                jittered_img = img
-                for func in img_transforms:
-                    jittered_img = func(jittered_img)
-                jittered_clip.append(jittered_img)
-
-        else:
-            raise TypeError('Expected numpy.ndarray or PIL.Image' +
-                            ' but got list of {0}'.format(type(clip[0])))
-        return jittered_clip
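How strong the jitter is follows directly from get_params. A sketch of the factor ranges with an illustrative strength of 0.4 (a factor of exactly 1.0 leaves a frame unchanged):

import random

brightness = contrast = saturation = hue = 0.4
b = random.uniform(max(0, 1 - brightness), 1 + brightness)  # in [0.6, 1.4]
c = random.uniform(max(0, 1 - contrast), 1 + contrast)      # in [0.6, 1.4]
s = random.uniform(max(0, 1 - saturation), 1 + saturation)  # in [0.6, 1.4]
hf = random.uniform(-hue, hue)                              # in [-0.4, 0.4]
print(b, c, s, hf)

Note that __call__ then shuffles the order of the selected adjustments, so the composition also varies from clip to clip.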
-
-
-class Normalize(object):
-    """Normalize a clip with mean and standard deviation.
-    Given mean: ``(M1,...,Mn)`` and std: ``(S1,..,Sn)`` for ``n`` channels, this transform
-    will normalize each channel of the input ``torch.*Tensor`` i.e.
-    ``input[channel] = (input[channel] - mean[channel]) / std[channel]``
-
-    .. note::
-        This transform acts out of place, i.e., it does not mutate the input tensor.
-
-    Args:
-        mean (sequence): Sequence of means for each channel.
-        std (sequence): Sequence of standard deviations for each channel.
-    """
-
-    def __init__(self, mean, std):
-        self.mean = mean
-        self.std = std
-
-    def __call__(self, clip):
-        """
-        Args:
-            clip (Tensor): Tensor clip of size (T, C, H, W) to be normalized.
-
-        Returns:
-            Tensor: Normalized Tensor clip.
-        """
-        return F.normalize(clip, self.mean, self.std)
-
-    def __repr__(self):
-        return self.__class__.__name__ + '(mean={0}, std={1})'.format(
-            self.mean, self.std)
diff --git a/AVLFormer/src/datasets/data_utils/volume_transforms.py b/AVLFormer/src/datasets/data_utils/volume_transforms.py
deleted file mode 100644
index 4abe2a3..0000000
--- a/AVLFormer/src/datasets/data_utils/volume_transforms.py
+++ /dev/null
@@ -1,76 +0,0 @@
-from PIL import Image
-import numpy as np
-import torch
-
-
-def my_convert_img(img):
-    """Converts (H, W, C) numpy.ndarray to (C, H, W) format
-    """
-    if len(img.shape) == 3:
-        img = img.transpose(2, 0, 1)
-    if len(img.shape) == 2:
-        img = np.expand_dims(img, 0)
-    return img
-
-
-class ClipToTensor(object):
-    """Convert a list of m (H x W x C) numpy.ndarrays in the range [0, 255]
-    to a torch.FloatTensor of shape (C x m x H x W) in the range [0, 1.0]
-    """
-
-    def __init__(self, channel_nb=3, div_255=True, numpy=False):
-        self.channel_nb = channel_nb
-        self.div_255 = div_255
-        self.numpy = numpy
-
-    def __call__(self, clip):
-        """
-        Args: clip (list of numpy.ndarray): clip (list of images)
-        to be converted to tensor.
-        """
-        # Retrieve shape
-        if isinstance(clip[0], np.ndarray):
-            h, w, ch = clip[0].shape
-            assert ch == self.channel_nb, 'Got {0} instead of {1} channels'.format(
-                ch, self.channel_nb)
-        elif isinstance(clip[0], Image.Image):
-            w, h = clip[0].size
-        else:
-            raise TypeError('Expected numpy.ndarray or PIL.Image '
-                            'but got list of {0}'.format(type(clip[0])))
-
-        np_clip = np.zeros([self.channel_nb, len(clip), int(h), int(w)])
-
-        # Convert
-        for img_idx, img in enumerate(clip):
-            if isinstance(img, np.ndarray):
-                pass
-            elif isinstance(img, Image.Image):
-                img = np.array(img, copy=False)
-            else:
-                raise TypeError('Expected numpy.ndarray or PIL.Image '
-                                'but got list of {0}'.format(type(clip[0])))
-            img = my_convert_img(img)
-            np_clip[:, img_idx, :, :] = img
-        if self.numpy:
-            if self.div_255:
-                np_clip = np_clip / 255
-            return np_clip
-
-        else:
-            tensor_clip = torch.from_numpy(np_clip)
-
-            if not isinstance(tensor_clip, torch.FloatTensor):
-                tensor_clip = tensor_clip.float()
-            if self.div_255:
-                tensor_clip = tensor_clip.div(255)
-            return tensor_clip
-
-
-class ToTensor(object):
-    """Converts numpy array to tensor
-    """
-
-    def __call__(self, array):
-        tensor = torch.from_numpy(array)
-        return tensor
diff --git a/AVLFormer/src/datasets/sampler_utils.py b/AVLFormer/src/datasets/sampler_utils.py
deleted file mode 100644
index 5133be3..0000000
--- a/AVLFormer/src/datasets/sampler_utils.py
+++ /dev/null
@@ -1,657 +0,0 @@
-import logging
-import math
-import os.path as op
-import time
-
-from src.utils.comm import get_local_rank, get_local_size, get_rank, get_world_size
-from src.utils.qd_common import exclusive_open_to_read
-from src.utils.tsv_file import load_list_file
-from src.utils.tsv_io import TSVDataset, get_tsv_lineidx, get_tsv_lineidx_8b
-import torch
-import torch.distributed as dist
-from torch.utils.data.sampler import Sampler
-
-
-class RankSplitSampler(Sampler):
-
-    def __init__(self, dataset, shuffle, random_seed):
-        self.dataset = dataset
-        self.shuffle = shuffle
-        self.random_seed = random_seed
-
-        self.world_size = get_world_size()
-        self.rank = get_rank()
-
-    def get_index(self):
-        source_list = self.dataset.get_composite_source_idx()
-        idx_split = list(enumerate(source_list))
-        idx_split = torch.tensor(idx_split)
-        if self.shuffle:
-            g = torch.Generator()
-            g.manual_seed(self.random_seed)
-            random_idx = torch.randperm(len(idx_split), generator=g)
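Taken together, ClipToTensor and Normalize turn a list of uint8 frames into a normalized float tensor. A self-contained sketch of the two conversions, with plain numpy/torch standing in for the classes above and the shapes as documented:

import numpy as np
import torch

frames = [np.random.randint(0, 256, (224, 224, 3), np.uint8) for _ in range(8)]

# ClipToTensor: list of (H, W, C) -> (C, T, H, W), rescaled to [0, 1].
np_clip = np.stack(frames).transpose(3, 0, 1, 2).astype(np.float32) / 255
clip = torch.from_numpy(np_clip).permute(1, 0, 2, 3)  # (T, C, H, W) for Normalize

# Normalize: input[channel] = (input[channel] - mean[channel]) / std[channel]
mean = torch.tensor([0.485, 0.456, 0.406])
std = torch.tensor([0.229, 0.224, 0.225])
normed = (clip - mean[None, :, None, None]) / std[None, :, None, None]
print(normed.shape)  # torch.Size([8, 3, 224, 224])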
idx_split = idx_split[random_idx] - sort_idx = torch.argsort(idx_split[:, 1]) - idx_split = idx_split[sort_idx] - rank_size = (len(idx_split) + self.world_size - 1) // self.world_size - offset = rank_size * self.rank - offset_end = offset + rank_size - offset_end = min(offset_end, len(idx_split)) - return idx_split[offset:offset_end, 0].tolist() - - def __iter__(self): - self.curr_idx = 0 - all_idx = self.get_index() - while True: - if self.curr_idx >= len(all_idx): - self.curr_idx -= len(all_idx) - yield all_idx[self.curr_idx] - self.curr_idx += 1 - - def __len__(self): - raise ValueError('should not be called') - - -def create_prepare_tsv_file_process(max_len=8): - import queue - import threading - prepare_queue = queue.Queue() - p = threading.Thread( - target=prepare_tsv_file_process, - args=(prepare_queue, max_len), - daemon=True, - ) - p.start() - return p, prepare_queue - - -def prepare_tsv_file_process(queue, max_len=8): - ftype = 'blobfuse' - logging.info('ftype = {}'.format(ftype)) - - prepared = [] - - while True: - start = time.time() - fnames = queue.get() - end = time.time() - if (end - start) > 5: - logging.info( - 'waiting {} to get a new tsv to prepare'.format(end - start)) - curr_fs = [] - for fname in fnames: - curr_fs.append(fname) - if fname.endswith('.tsv'): - lineidx = get_tsv_lineidx(fname) - from src.utils.tsv_io import QDFile - if QDFile.isfile(lineidx): - curr_fs.append(lineidx) - lineidx8b = get_tsv_lineidx_8b(fname) - if QDFile.isfile(lineidx8b): - curr_fs.append(lineidx8b) - - def unprepare(info): - logging.info('unprepare {}'.format(info['fnames'])) - if ftype == 'blobfuse': - for f in info['fps']: - f.close() - logging.info('unprepared {}'.format(info['fnames'])) - - sames = [ - i for i, p in enumerate(prepared) - if all(f in p['fnames'] for f in curr_fs) - ] - if len(sames) > 0: - i = sames[0] - p = prepared[i] - del prepared[i] - prepared.append(p) - logging.info( - 'no need to prepare {} as it prepared'.format(curr_fs)) - continue - - while max_len > 0 and len(prepared) >= max_len: - unprepare(prepared.pop(0)) - - logging.info('prepare {}'.format(curr_fs)) - start = time.time() - if ftype == 'blobfuse': - info = { - 'fnames': curr_fs, - 'fps': [exclusive_open_to_read(x) for x in curr_fs] - } - prepared.append(info) - logging.info('use {}s, prepared {}, all hold={}'.format( - time.time() - start, - curr_fs, - ', '.join([f for p in prepared for f in p['fnames']]), - )) - - -def ordered_unique(sequence): - seen = set() - return [x for x in sequence if not (x in seen or seen.add(x))] - - -class PrepareData(object): - - def __init__( - self, - dataset, - prepare_t_versions=[], - fixed_samples_in_node=False, - disable_prepare=None, - ): - self.prepare_files = None - self.prepare_process = None - self.dataset = dataset - self.prepare_t_versions = prepare_t_versions - self.fixed_samples_in_node = fixed_samples_in_node - self.disable_prepare = disable_prepare - - def get_composite_source_files(self): - root = self.dataset.root - assert self.dataset.is_composite - result = [] - for t in ['visual_tsv', 'label_tsv', 'cap_tsv']: - tsv = getattr(self.dataset, t, None) - if tsv is not None: - result.append([op.join(root, f) for f in tsv.file_list]) - return result - - def prepare(self, split): - if self.disable_prepare: - return - self.ensure_init_prepare() - q = self.prepare_queue - size = q.qsize() - if size > 100: - logging.info('prepare queue is too long {}'.format(size)) - q.put([ps[split] for ps in self.prepare_files]) - - def ensure_init_prepare(self): - if 
self.prepare_files is None: - self.prepare_files = self.get_composite_source_files() - if self.prepare_process is None: - max_len = 8 if not self.fixed_samples_in_node else 0 - p, prepare_queue = create_prepare_tsv_file_process(max_len=max_len) - self.prepare_process = p - self.prepare_queue = prepare_queue - - -class SplitBySplitSampler(Sampler): - # only used in training mode. - # prefer to use PrepareData(), but this class has already been used for a - # while and is working great. New approaches can leverage PrepareData, but - # at this moment, it is ok not to re-factor it - def __init__( - self, - dataset, - group_size=1, - shuffle=True, - fixed_samples_in_node=False, - random_seed=9, - prepare_t_versions=[], - disable_prepare=None, - ): - from src.utils.qd_common import print_frame_info - print_frame_info() - self.dataset = dataset - self.group_size = group_size - self.random_seed = random_seed - self.shuffle = shuffle - - self.rank = get_rank() - self.local_rank = get_local_rank() - self.world_size = get_world_size() - self.local_size = get_local_size() - - self.node_size = self.world_size // self.local_size - self.node_idx = self.rank // self.local_size - - self.shuffle_group_process = None - - self.prepare_process = None - self.prepare_queue = None - self.prepare_files = None - # currently, we only support to prepare one kind of files, but it could - # be extendeed to multiple files if we need - self.prepare_t_versions = prepare_t_versions - self.sub_process_create_shuffle = False - self._idx_split = None - self.iter_shuffle_group = None - - self.curr_group_buffers = None - self.next_group_index = 0 - self.cache_group_index_on_node = None - - self.disable_prepare = disable_prepare - self.get_group_process = None - self.fixed_samples_in_node = fixed_samples_in_node - - def get_composite_source_idx(self): - return self.dataset.get_composite_source_idx() - - def get_composite_source_files(self): - data = self.dataset.dataset.data - split = self.dataset.dataset.split - dataset = TSVDataset(data) - result = [] - for t, version in self.prepare_t_versions: - tsv = dataset.get_data(split, t, version) - if op.isfile(tsv): - result.append([tsv]) - else: - x_tsv = dataset.get_data(split + 'X', t, version) - assert op.isfile(x_tsv) - result.append(load_list_file(x_tsv)) - return result - - def load_idx_split(self): - logging.info('loading source list') - source_list = self.get_composite_source_idx() - logging.info('loaded source list') - idx_split = list(enumerate(source_list)) - idx_split = torch.tensor(idx_split) - return idx_split - - @property - def idx_split(self): - if self._idx_split is None: - self._idx_split = self.load_idx_split() - self._idx_split.share_memory_() - return self._idx_split - - def get_shufle_idx(self, n): - g = torch.Generator() - g.manual_seed(self.random_seed) - random_idx = torch.randperm(n, generator=g) - self.random_seed += 99 - return random_idx - - def get_group_index_on_node_random(self): - idx_split = self.idx_split - - max_split = idx_split[:, 1].max() + 1 - priority = self.get_shufle_idx(max_split) - - random_idx = self.get_shufle_idx(len(idx_split)) - idx_split = idx_split[random_idx] - - idx_split = torch.cat( - [idx_split[idx_split[:, 1] == p] for p in priority]) - - num_idx_on_node = (len(idx_split) + self.node_size - - 1) // self.node_size - offset = num_idx_on_node * self.node_idx - offset_end = offset + num_idx_on_node - offset_end = min(offset_end, len(idx_split)) - idx_split = idx_split[offset:offset_end] - - unique_split_index = 
ordered_unique(idx_split[:, 1].tolist()) - logging.info(unique_split_index) - result = [{ - 'idx_in_group': - idx_split[idx_split[:, 1] == s][:, 0].tolist(), - 'split_in_group': - s, - } for s in unique_split_index] - return result - - def get_group_index_on_node(self): - if self.shuffle and not self.fixed_samples_in_node: - return self.get_group_index_on_node_random() - elif self.shuffle and self.fixed_samples_in_node: - if self.cache_group_index_on_node is None: - self.cache_group_index_on_node = self.get_group_index_on_node_random( - ) - idx = self.get_shufle_idx(len(self.cache_group_index_on_node)) - group_in_node = [self.cache_group_index_on_node[i] for i in idx] - for g in group_in_node: - idx = self.get_shufle_idx(len(g['idx_in_group'])) - g['idx_in_group'] = [g['idx_in_group'][i] for i in idx] - return group_in_node - else: - if self.cache_group_index_on_node is None: - self.cache_group_index_on_node = self.get_group_index_on_node_random( - ) - return self.cache_group_index_on_node - - def get_next_group_index_on_node(self): - if self.curr_group_buffers is None: - self.curr_group_buffers = self.get_group_index_on_node() - self.next_group_index = 0 - if self.next_group_index >= len(self.curr_group_buffers): - self.curr_group_buffers = self.get_group_index_on_node() - self.next_group_index = 0 - g = self.curr_group_buffers[self.next_group_index] - self.next_group_index += 1 - return g - - def get_group_thread(self, q): - while True: - if q.qsize() < 8: - g = self.get_next_group_index_on_node() - q.put(g) - else: - time.sleep(1) - - def __iter__(self): - use_thread_to_get_group = True - if not use_thread_to_get_group: - group_buffers = [ - self.get_next_group_index_on_node() for _ in range(4) - ] - if self.local_rank == 0: - for g in group_buffers: - self.prepare(g['split_in_group']) - assert len(group_buffers) > 0 - idx = self.local_rank - while True: - while idx >= len(group_buffers[0]['idx_in_group']): - idx -= len(group_buffers[0]['idx_in_group']) - group_buffers.pop(0) - new_g = self.get_next_group_index_on_node() - if self.local_rank == 0: - self.prepare(new_g['split_in_group']) - group_buffers.append(new_g) - r = group_buffers[0]['idx_in_group'][idx] - yield r - idx += self.local_size - else: - self.ensure_init_get_group_thread() - group_buffers = [self.get_group_queue.get() for _ in range(4)] - if self.local_rank == 0: - for g in group_buffers: - self.prepare(g['split_in_group']) - assert len(group_buffers) > 0 - idx = self.local_rank - while True: - while idx >= len(group_buffers[0]['idx_in_group']): - idx -= len(group_buffers[0]['idx_in_group']) - group_buffers.pop(0) - start = time.time() - new_g = self.get_group_queue.get() - cost = time.time() - start - logging.info( - 'time to get group index on node: {}'.format(cost)) - if self.local_rank == 0: - self.prepare(new_g['split_in_group']) - group_buffers.append(new_g) - r = group_buffers[0]['idx_in_group'][idx] - yield r - idx += self.local_size - - def ensure_init_get_group_thread(self): - if self.get_group_process is None: - import queue - import threading - q = queue.Queue() - t = threading.Thread( - target=self.get_group_thread, - args=(q, ), - daemon=True, - ) - t.start() - self.get_group_process = t - self.get_group_queue = q - - def ensure_init_prepare(self): - if self.prepare_files is None: - self.prepare_files = self.get_composite_source_files() - if self.prepare_process is None: - max_len = 8 if not self.fixed_samples_in_node else 0 - p, prepare_queue = create_prepare_tsv_file_process(max_len=max_len) - 
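Both RankSplitSampler and the node-level grouping above rely on the same ceil-division sharding; a tiny standalone illustration:

def shard(indices, rank, world_size):
    # Mirrors rank_size = (len(idx_split) + world_size - 1) // world_size
    rank_size = (len(indices) + world_size - 1) // world_size
    offset = rank_size * rank
    return indices[offset:min(offset + rank_size, len(indices))]

print([shard(list(range(10)), r, 3) for r in range(3)])
# [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]  (the last shard may run short)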
self.prepare_process = p - self.prepare_queue = prepare_queue - - def prepare(self, split): - if self.disable_prepare: - return - self.ensure_init_prepare() - q = self.prepare_queue - size = q.qsize() - if size > 100: - logging.info('prepare queue is too long {}'.format(size)) - q.put([ps[split] for ps in self.prepare_files]) - - def __len__(self): - raise ValueError('should not be called') - - -class AttachIterationNumberBatchSampler(object): - - def __init__(self, - batch_sampler, - start_iter, - num_iters, - gradient_accumulate=1): - self.batch_sampler = batch_sampler - self.curr_iter = start_iter - self.max_iter = num_iters - self.gradient_accumulate = gradient_accumulate - - def __getattr__(self, att): - return getattr(self.batch_sampler, att) - - def __iter__(self): - #if hasattr(self.batch_sampler, 'skip') and self.curr_iter > 0: - #logging.info('we will skip {} batches'.format(self.curr_iter)) - #self.batch_sampler.skip(self.curr_iter) - for idx_batch, batch in enumerate(self.batch_sampler): - batch = [{ - 'iteration': self.curr_iter, - 'idx': i, - 'max_iter': self.max_iter - } for i in batch] - yield batch - if (idx_batch + 1) % self.gradient_accumulate == 0: - self.curr_iter += 1 - - def __len__(self): - return len(self.batch_sampler) - - -class OrderedSplitSampler(Sampler): - - def __init__(self, data_length): - curr_rank = get_rank() - world_size = get_world_size() - rank_size = (data_length + world_size - 1) // world_size - start = rank_size * curr_rank - end = start + rank_size - assert start >= 0 and start <= data_length - if curr_rank < world_size - 1: - assert end >= 0 and end <= data_length - end = min(end, data_length) - self.start = start - self.end = end - - def __iter__(self): - return iter(range(self.start, self.end)) - - def __len__(self): - return self.end - self.start - - -class BatchSampler(Sampler): - r"""Wraps another sampler to yield a mini-batch of indices. - - Args: - sampler (Sampler): Base sampler. - batch_size (int): Size of mini-batch. 
- drop_last (bool): If ``True``, the sampler will drop the last batch if - its size would be less than ``batch_size`` - - Example: - >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=False)) - [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]] - >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=True)) - [[0, 1, 2], [3, 4, 5], [6, 7, 8]] - """ - - def __init__(self, sampler, batch_size, drop_last): - if not isinstance(sampler, Sampler): - raise ValueError( - "sampler should be an instance of " - "torch.utils.data.Sampler, but got sampler={}".format(sampler)) - if not isinstance(drop_last, bool): - raise ValueError("drop_last should be a boolean value, but got " - "drop_last={}".format(drop_last)) - self.sampler = sampler - self.batch_size = batch_size - self.drop_last = drop_last - - def __iter__(self): - batch = [] - for idx in self.sampler: - batch.append(idx) - if len(batch) == self.batch_size: - yield batch - batch = [] - if len(batch) > 0 and not self.drop_last: - yield batch - - def __len__(self): - if self.drop_last: - return len(self.sampler) // self.batch_size - else: - return (len(self.sampler) + self.batch_size - 1) // self.batch_size - - -class IterationBasedBatchSampler(BatchSampler): - """ - Wraps a BatchSampler, resampling from it until - a specified number of iterations have been sampled - """ - - def __init__( - self, - batch_sampler, - num_iterations, - start_iter=0, - ): - self.batch_sampler = batch_sampler - self.num_iterations = num_iterations - self.start_iter = start_iter - - if hasattr(batch_sampler, 'batch_size'): - self.batch_size = batch_sampler.batch_size - - if hasattr(batch_sampler, 'drop_last'): - self.drop_last = batch_sampler.drop_last - - def __iter__(self): - iteration = self.start_iter - while iteration <= self.num_iterations: - # if the underlying sampler has a set_epoch method, like - # DistributedSampler, used for making each process see - # a different split of the dataset, then set it - if hasattr(self.batch_sampler.sampler, "set_epoch"): - self.batch_sampler.sampler.set_epoch(iteration) - for batch in self.batch_sampler: - iteration += 1 - if iteration > self.num_iterations: - break - yield batch - - def __len__(self): - return self.num_iterations - - -class DynamicBatchSampler(BatchSampler): - - def __init__(self, sampler, get_batch_size, start_iter=0): - self.sampler = sampler - self.get_batch_size = get_batch_size - self.start_iter = start_iter - - def __iter__(self): - batch = [] - batch_size = None - curr_iter = self.start_iter - for idx in self.sampler: - batch.append(idx) - if batch_size is None: - batch_size = self.get_batch_size(curr_iter) - if len(batch) == batch_size: - yield batch - batch_size = None - curr_iter += 1 - batch = [] - - -class DistributedSampler(Sampler): - """Sampler that restricts data loading to a subset of the dataset. - It is especially useful in conjunction with - :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each - process can pass a DistributedSampler instance as a DataLoader sampler, - and load a subset of the original dataset that is exclusive to it. - .. note:: - Dataset is assumed to be of constant size. - Arguments: - dataset: Dataset used for sampling. - num_replicas (optional): Number of processes participating in - distributed training. - rank (optional): Rank of the current process within num_replicas. 
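The DistributedSampler declared here (its implementation follows) pads the index list so it divides evenly across replicas before each rank takes a contiguous slice; the arithmetic in isolation, with made-up sizes:

import math

N, num_replicas = 10, 4
num_samples = int(math.ceil(N * 1.0 / num_replicas))  # 3 per rank
total_size = num_samples * num_replicas               # 12

indices = list(range(N))
indices += indices[:total_size - len(indices)]        # wrap-around padding
shards = [indices[r * num_samples:(r + 1) * num_samples]
          for r in range(num_replicas)]
print(shards)  # [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 0, 1]]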
- """ - - def __init__(self, - dataset, - num_replicas=None, - rank=None, - shuffle=True, - length_divisible=1): - if num_replicas is None: - if not dist.is_available(): - raise RuntimeError( - "Requires distributed package to be available") - num_replicas = get_world_size() - if rank is None: - if not dist.is_available(): - raise RuntimeError( - "Requires distributed package to be available") - rank = get_rank() - self.dataset = dataset - self.num_replicas = num_replicas - self.rank = rank - self.epoch = 0 - self.num_samples = int( - math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) - if length_divisible > 1: - import logging - logging.info('before making divisible = {}'.format( - self.num_samples)) - self.num_samples = ((self.num_samples + length_divisible - 1) // - length_divisible) * length_divisible - logging.info('adjust to = {}'.format(self.num_samples)) - self.total_size = self.num_samples * self.num_replicas - self.shuffle = shuffle - - def __iter__(self): - if self.shuffle: - # deterministically shuffle based on epoch - g = torch.Generator() - g.manual_seed(self.epoch) - indices = torch.randperm(len(self.dataset), generator=g).tolist() - else: - indices = torch.arange(len(self.dataset)).tolist() - - # add extra samples to make it evenly divisible - assert (self.total_size - - len(indices)) <= len(indices), 'not implemented' - indices += indices[:(self.total_size - len(indices))] - assert len(indices) == self.total_size - - # subsample - offset = self.num_samples * self.rank - indices = indices[offset:offset + self.num_samples] - assert len(indices) == self.num_samples - - return iter(indices) - - def __len__(self): - return self.num_samples - - def set_epoch(self, epoch): - self.epoch = epoch diff --git a/AVLFormer/src/datasets/vision_language_tsv.py b/AVLFormer/src/datasets/vision_language_tsv.py deleted file mode 100644 index 49a68a1..0000000 --- a/AVLFormer/src/datasets/vision_language_tsv.py +++ /dev/null @@ -1,520 +0,0 @@ -""" -Copyright (c) Microsoft Corporation. -Licensed under the MIT license. 
- -""" -import io -import json -import os.path as op - -from PIL import Image -import av -import h5py -import numpy as np -from src.utils.load_files import ( - find_file_path_in_yaml, - load_box_linelist_file, - load_from_yaml_file, -) -from src.utils.logger import LOGGER -from src.utils.tsv_file import CompositeTSVFile, TSVFile -from src.utils.tsv_file_ops import tsv_reader -import torch - -from .data_utils.image_ops import img_from_base64 -from .data_utils.video_ops import extract_frames_from_video_path - -# video_transforms & volume_transforms from https://github.com/hassony2/torch_videovision -from .data_utils.video_transforms import ( - CenterCrop, - Compose, - Normalize, - RandomCrop, - Resize, -) -from .data_utils.volume_transforms import ClipToTensor - - -class VisionLanguageTSVDataset(object): - - def __init__(self, - args, - yaml_file, - tokenizer, - tensorizer=None, - is_train=True, - on_memory=False): - - self.args = args - self.tokenizer = tokenizer - self.tensorizer = tensorizer - - self.yaml_file = yaml_file - self.root = op.dirname(yaml_file) - - self.cfg = load_from_yaml_file(yaml_file) - self.is_composite = self.cfg.get('composite', False) - self.cap_linelist_file = find_file_path_in_yaml( - self.cfg.get('caption_linelist', None), self.root) - - # self.cfg = - # {'caption': 'train.caption.tsv', - # 'caption_coco_format': 'train.caption_coco_format.json', - # 'caption_linelist': 'train.caption.linelist.tsv', - # 'img': 'frame_tsv/train_32frames.img.tsv', - # 'label': 'train.label.tsv' - # 'audio': 'train_mp3.hdf'} - - hdf5_file = op.join(self.root, self.cfg.get('audio', None)) - assert hdf5_file != None - - self.att_mode = args.att_mode - - if is_train and args.text_mask_type == 'pos_tag': - jfile = open(op.join(self.root, self.cfg.get('ner', None)), 'r') - self.ner = json.load(jfile) - jfile.close() - - with open(hdf5_file, 'rb') as f: - self.hdf5_file = io.BytesIO(f.read()) - self.sample_rate = 32000 - self.clip_length = 10 * self.sample_rate - self.audio_file = None - - self.visual_file = self.cfg.get('img', None) - self.visual_tsv = self.get_tsv_file(self.visual_file) - - self.label_file = self.cfg.get('label', None) - self.label_tsv = self.get_tsv_file(self.label_file) - - self.cap_file = self.cfg.get('caption', None) - self.cap_tsv = self.get_tsv_file(self.cap_file) - - if self.is_composite: - assert op.isfile(self.cap_linelist_file) - self.cap_line_list = [ - int(row[2]) for row in tsv_reader(self.cap_linelist_file) - ] - self.img_line_list = [i for i in range(len(self.cap_line_list))] - - # True - elif self.cap_linelist_file: - line_list = load_box_linelist_file(self.cap_linelist_file) - self.img_line_list = line_list[0] - self.cap_line_list = line_list[1] - else: - # one caption per image/video - self.img_line_list = [i for i in range(self.label_tsv.num_rows())] - self.cap_line_list = [0 for i in range(self.label_tsv.num_rows())] - - if is_train: - assert self.cap_tsv is not None - assert tokenizer is not None - - self.is_train = is_train - self.image_keys = self.prepare_image_keys() - self.key2index = self.prepare_image_key_to_index() - self.on_memory = on_memory - if on_memory: - if self.cap_tsv is not None: - self.load_caption_to_memory() - - self.is_train = is_train - self.img_res = getattr(args, 'img_res', 224) - self.patch_size = getattr(args, 'patch_size', 16) - - self.img_feature_dim = args.img_feature_dim - self.decoder_target_fps = 3 - self.decoder_num_frames = getattr(args, 'max_num_frames', 2) - self.decoder_multi_thread_decode = False - - 
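For the audio branch, every waveform is forced to a fixed 10-second budget at 32 kHz, and training applies a random gain. The relevant arithmetic, checked standalone (the helpers pydub_augment and pad_or_truncate appear just below):

import numpy as np

sample_rate = 32000
clip_length = 10 * sample_rate  # 320000 samples, i.e. 10 s of audio

# pydub_augment draws an integer gain in [-7, 6] dB (torch.randint(14) - 7)
# and converts it to a linear amplitude.
gain = 6
amp = 10 ** (gain / 20)
print(clip_length, round(amp, 3))  # 320000 1.995 (+6 dB is roughly 2x)

# pad_or_truncate then zero-pads or clips to exactly the target length.
x = np.ones(5, np.float32)
padded = np.concatenate((x, np.zeros(8 - len(x), np.float32)))
print(padded)  # [1. 1. 1. 1. 1. 0. 0. 0.]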
self.decoder_safeguard_duration = False - self.add_od_labels = getattr(args, 'add_od_labels', False) - self.use_asr = getattr(args, 'use_asr', False) - - LOGGER.info(f'Use_asr: {self.use_asr}') - # use uniform sampling as default for now - self.decoder_sampling_strategy = getattr(args, - 'decoder_sampling_strategy', - 'uniform') - LOGGER.info(f'isTrainData: {self.is_train}\n[PyAV video parameters] ' - f'Num of Frame: {self.decoder_num_frames}, ' - f'FPS: {self.decoder_target_fps}, ' - f'Sampling: {self.decoder_sampling_strategy}') - # Initialize video transforms - # adapt from https://github.com/hassony2/torch_videovision - - if is_train == True: - self.raw_video_crop_list = [ - Resize(self.img_res), - RandomCrop((self.img_res, self.img_res)), - ClipToTensor(channel_nb=3), - Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - ] - else: - self.raw_video_crop_list = [ - Resize(self.img_res), - CenterCrop((self.img_res, self.img_res)), - ClipToTensor(channel_nb=3), - Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - ] - self.raw_video_prcoess = Compose(self.raw_video_crop_list) - - def open_hdf5(self): - self.audio_file = h5py.File(self.hdf5_file, 'r') - - def decode_mp3(self, mp3_arr): - """ - decodes an array if uint8 representing an mp3 file - :rtype: np.array - """ - container = av.open(io.BytesIO(mp3_arr.tobytes())) - stream = next(s for s in container.streams if s.type == 'audio') - a = [] - for _, packet in enumerate(container.demux(stream)): - for frame in packet.decode(): - a.append(frame.to_ndarray().reshape(-1)) - waveform = np.concatenate(a) - if waveform.dtype != 'float32': - raise RuntimeError('Unexpected wave type') - return waveform - - def pydub_augment(self, waveform, gain_augment=7, ir_augment=0): - if gain_augment: - gain = torch.randint(gain_augment * 2, (1, )).item() - gain_augment - amp = 10**(gain / 20) - waveform = waveform * amp - return waveform - - def pad_or_truncate(self, x, audio_length): - """Pad all audio to specific length.""" - if len(x) <= audio_length: - return np.concatenate( - (x, np.zeros(audio_length - len(x), dtype=np.float32)), axis=0) - else: - return x[0:audio_length] - - def roll_func(self, x, axis=1, shift=None, shift_range=50): - x = torch.as_tensor(x) - sf = shift - if shift is None: - sf = int(np.random.randint(-shift_range, shift_range)) - - return x.roll(sf, axis) - - def __del__(self): - if self.audio_file is not None: - self.audio_file.close() - self.audio_file = None - - def get_composite_source_idx(self): - if self.is_composite: - assert op.isfile(self.cap_linelist_file) - self.composite_source_idx = [ - int(row[0]) for row in tsv_reader(self.cap_linelist_file) - ] - else: - # only a single tsv file is used as input - self.composite_source_idx = [ - 0 for _ in range(len(self.cap_line_list)) - ] - return self.composite_source_idx - - def get_tsv_file(self, tsv_file): - if tsv_file: - if self.is_composite: - return CompositeTSVFile(tsv_file, - self.cap_linelist_file, - root=self.root) - tsv_path = find_file_path_in_yaml(tsv_file, self.root) - return TSVFile(tsv_path) - - def load_caption_to_memory(self): - self.caption_on_memory = {} - for img_idx in set(self.img_line_list): - row = self.get_row_from_tsv(self.cap_tsv, img_idx) - for cap_idx, data in enumerate(json.loads(row[1])): - self.caption_on_memory[(img_idx, cap_idx)] = data['caption'] - - def get_valid_tsv(self): - if self.is_train: - return self.cap_tsv - # sorted by file size - if self.cap_tsv: - return self.cap_tsv - if self.visual_tsv: - 
return self.visual_tsv - - def prepare_image_keys(self): - tsv = self.get_valid_tsv() - return [tsv.get_key(i) for i in range(tsv.num_rows())] - - def prepare_image_key_to_index(self): - tsv = self.get_valid_tsv() - return {tsv.get_key(i): i for i in range(tsv.num_rows())} - - def get_image_cap_index(self, idx): - return self.img_line_list[idx], self.cap_line_list[idx] - - def get_row_from_tsv(self, tsv, img_idx): - row = tsv[img_idx] - if self.is_composite: - assert self.image_keys[img_idx].endswith(row[0]) - else: - assert row[0] == self.image_keys[img_idx] - return row - - def get_caption(self, img_idx, cap_idx): - if self.is_train: - if self.on_memory: - return self.caption_on_memory[(img_idx, cap_idx)] - row = self.get_row_from_tsv(self.cap_tsv, img_idx) - return json.loads(row[1])[cap_idx]['caption'] - return "" - - def get_caption_and_timeinfo(self, data, cap_idx): - caption, tag, start, end = '', ' ', None, None - data_sample = data[cap_idx] - if self.is_train: - caption = data_sample['caption'] - if 'start' in data_sample.keys(): - start = data_sample['start'] - if 'end' in data_sample.keys(): - end = data_sample['end'] - if 'asr' in data_sample.keys() and self.use_asr: - asr = data_sample['asr'].lower() - tag = asr - else: - if 'start' in data_sample.keys(): - start = data_sample['start'] - if 'end' in data_sample.keys(): - end = data_sample['end'] - if 'asr' in data_sample.keys() and self.use_asr: - asr = data_sample['asr'].lower() - tag = asr - return caption, tag, start, end - - def get_caption_and_timeinfo_wrapper(self, img_idx, cap_idx): - row = self.get_row_from_tsv(self.cap_tsv, img_idx) - data_sample = json.loads(row[1]) - caption, asr_or_tag, start, end = self.get_caption_and_timeinfo( - data_sample, cap_idx) - return caption, asr_or_tag, start, end - - def get_caption_file_in_coco_format(self): - # for evaluation - cap_file_coco_format = find_file_path_in_yaml( - self.cfg.get('caption_coco_format', None), self.root) - if cap_file_coco_format: - return cap_file_coco_format - test_split = op.basename(self.yaml_file).split('.')[0] - return op.join(self.root, test_split + '_caption_coco_format.json') - - def get_captions_by_key(self, key): - # get a list of captions for image (by key) - img_idx = self.key2index[key] - cap_info = json.loads(self.cap_tsv[img_idx][1]) - return [c['caption'] for c in cap_info] - - def get_video_key(self, idx): - # line_no = self.get_line_no(idx) - # return self.label_tsv[line_no][0] - return self.get_row_from_tsv(self.label_tsv, idx)[0] - - def apply_augmentations(self, frames): - # if failed to decode video, generate fake frames (should be corner case) - if frames is None: - frames = np.zeros((self.decoder_num_frames, self.img_res, - self.img_res, 3)).astype(np.uint8) - # (T, C, H, W) -> (T, H, W, C), channel is RGB - elif 'torch' in str(frames.dtype): - frames = frames.numpy() - frames = np.transpose(frames, (0, 2, 3, 1)) - else: - frames = frames.astype(np.uint8) - frames = np.transpose(frames, (0, 2, 3, 1)) - num_of_frames, height, width, channels = frames.shape - - frame_list = [] - for i in range(self.decoder_num_frames): - if num_of_frames == 1: - # if it is from image-caption dataset, we duplicate the image - # convert numpy to PIL format, compatible to augmentation operations - frame_list.append(Image.fromarray(frames[0])) - else: - # if it is from video-caption dataset, we add each frame to the list - # convert numpy to PIL format, compatible to augmentation operations - frame_list.append(Image.fromarray(frames[i])) - - # adapt from 
torch_videovision: https://github.com/hassony2/torch_videovision - # after augmentation, output tensor (C x T x H x W) in the range [0, 1.0] - crop_frames = self.raw_video_prcoess(frame_list) - # (C x T x H x W) --> (T x C x H x W) - crop_frames = crop_frames.permute(1, 0, 2, 3) - return crop_frames - - def get_image(self, bytestring): - # output numpy array (T, C, H, W), channel is RGB, T = 1 - cv2_im = img_from_base64(bytestring) - cv2_im = cv2_im[:, :, ::-1] # COLOR_BGR2RGB - # cv2_im = cv2.cvtColor(cv2_im, cv2.COLOR_BGR2RGB) - output = np.transpose(cv2_im[np.newaxis, ...], (0, 3, 1, 2)) - return output - - def get_frames_from_tsv(self, binary_frms): - # get pre-extracted video frames from tsv files - frames = [] - _C, _H, _W = 3, 224, 224 - if self.decoder_num_frames > len(binary_frms): - print( - f"Corrupt videos, requested {self.decoder_num_frames} frames, " - f"but got only {len(binary_frms)} frames, will return all zeros instead" - ) - return np.zeros((self.decoder_num_frames, _C, _H, _W), - dtype=np.int64) - - def sampling(start, end, n): - if n == 1: - return [int(round((start + end) / 2.))] - if n < 1: - raise Exception("behaviour not defined for n<2") - step = (end - start) / float(n - 1) - return [int(round(start + x * step)) for x in range(n)] - - for i in sampling(0, len(binary_frms) - 1, self.decoder_num_frames): - try: - image = self.get_image(binary_frms[i]) - except Exception as e: - print(f"Corrupt frame at {i}") - image = np.zeros((1, _C, _H, _W), dtype=np.int64) - _, _C, _H, _W = image.shape - frames.append(image) - return np.vstack(frames) - - def decode_and_get_frames(self, clip_path_name, start=None, end=None): - # online decode raw video file, and get video frames - # output tensor (T, C, H, W), channel is RGB, T = self.decoder_num_frames - if 'TVC' in clip_path_name: - # default clip_path_name: datasets/TVC/videos/{tv_show}/{tv_show}_clips/{tv_show}_{seasoninfo}/{video_id}.mp4_{start_time}_{end_time} - # To load video file, we will need to remove start&end info here - resolved_video_path = '_'.join(clip_path_name.split('_')[0:-2]) - else: # VATEX, MSVD, MSRVTT, Youcook2 - resolved_video_path = clip_path_name - frames, video_max_pts = extract_frames_from_video_path( - resolved_video_path, self.decoder_target_fps, - self.decoder_num_frames, self.decoder_multi_thread_decode, - self.decoder_sampling_strategy, self.decoder_safeguard_duration, - start, end) - return frames - - def get_visual_data(self, idx, start=None, end=None): - row = self.get_row_from_tsv(self.visual_tsv, idx) - # if the input is a video tsv with only video file paths, - # extract video frames on-the-fly, and return a video-frame tensor - if row[0] == row[-1]: - return self.decode_and_get_frames(row[-1], start, end), True - # if the input is a video tsv with frames pre-extracted, - # return a video-frame tensor - elif len(row) >= self.decoder_num_frames + 2: - return self.get_frames_from_tsv(row[2:]), True - # if the input is a image tsv, return image numpy array - else: - return self.get_image(row[-1]), False - - def __len__(self): - return len(self.img_line_list) - - def __getitem__(self, idx): - if self.args.debug_speed: - idx = idx % self.args.effective_batch_size - - # audio part - if self.audio_file is None: - self.open_hdf5() - audio_name = self.audio_file['audio_name'][idx].decode() - wave_form = self.decode_mp3(self.audio_file['mp3'][idx]) - wave_form = self.pydub_augment(waveform=wave_form) - wave_form = self.pad_or_truncate(x=wave_form, - audio_length=self.clip_length) - - if 
self.is_train: - wave_form = self.roll_func(x=wave_form.reshape(1, -1)) - else: - wave_form = wave_form.reshape(1, -1) - - img_idx, cap_idx = self.get_image_cap_index(idx) - - img_key = self.image_keys[img_idx] - - # data consistency check - assert audio_name == img_key.split( - '/')[-1][:-4], f'audio:{audio_name}, video:{img_key}' - - caption_sample, tag, start, end = self.get_caption_and_timeinfo_wrapper( - img_idx, cap_idx) - # tag = ' ' start = None end = None is_video = True - # get image or video frames - # frames: (T, C, H, W), is_video: binary tag - raw_frames, is_video = self.get_visual_data(img_idx, start, end) - - # apply augmentation. frozen-in-time if the input is an image - # preproc_frames: (T, C, H, W), C = 3, H = W = self.img_res, channel is RGB - preproc_frames = self.apply_augmentations(raw_frames) - - # tokenize caption and generate attention maps - # it will consider only # of visual tokens for building attention maps. # is args.max_img_seq_length - if isinstance(caption_sample, dict): - caption = caption_sample["caption"] - else: - caption = caption_sample - caption_sample = None - - if self.is_train and self.args.text_mask_type == "pos_tag": - if caption_sample is None: - caption_sample = dict() - caption_sample['bert_pos_tag'] = self.ner[audio_name] - - # add_od_labels = False - if self.args.add_od_labels == True: - example = self.tensorizer.tensorize_example_e2e( - caption, - preproc_frames, - wave_form, - text_b=tag, - text_meta=caption_sample, - mode=self.att_mode) - else: - example = self.tensorizer.tensorize_example_e2e( - caption, - preproc_frames, - wave_form, - text_meta=caption_sample, - mode=self.att_mode) - - # preparing outputs - meta_data = {} - meta_data['caption'] = caption # raw text data, not tokenized - meta_data['img_key'] = img_key - meta_data['is_video'] = is_video # True: video data, False: image data - meta_data['tag'] = tag - - return img_key, example, meta_data - - -class VisionLanguageTSVYamlDataset(VisionLanguageTSVDataset): - """ TSVDataset taking a Yaml file for easy function call - """ - - def __init__(self, - args, - yaml_file, - tokenizer, - tensorizer=None, - is_train=True, - on_memory=False): - # print('Init video/image captioning dataloader...') - super(VisionLanguageTSVYamlDataset, - self).__init__(args, yaml_file, tokenizer, tensorizer, is_train, - on_memory) diff --git a/AVLFormer/src/datasets/vl_dataloader.py b/AVLFormer/src/datasets/vl_dataloader.py deleted file mode 100644 index 3e994a0..0000000 --- a/AVLFormer/src/datasets/vl_dataloader.py +++ /dev/null @@ -1,153 +0,0 @@ -""" -Copyright (c) Microsoft Corporation. -Licensed under the MIT license. 
- -""" -import os.path as op - -from src.utils.comm import get_world_size -from src.utils.logger import LOGGER as logger -import torch - -from .caption_tensorizer import build_tensorizer -from .data_sampler import DistributedSamplerLimited, NodeSplitSampler -from .vision_language_tsv import VisionLanguageTSVYamlDataset - - -def build_dataset(args, yaml_file, tokenizer, is_train=True): - logger.info(f'yaml_file:{yaml_file}') - if not op.isfile(yaml_file): - yaml_file = op.join(args.data_dir, yaml_file) - assert op.isfile(yaml_file), f"{yaml_file} does not exists" - tensorizer = build_tensorizer(args, tokenizer, is_train=is_train) - dataset_class = VisionLanguageTSVYamlDataset - return dataset_class(args, yaml_file, tokenizer, tensorizer, is_train, - args.on_memory) - - -class IterationBasedBatchSampler(torch.utils.data.sampler.BatchSampler): - """ - Wraps a BatchSampler, resampling from it until - a specified number of iterations have been sampled - """ - - def __init__(self, batch_sampler, num_iterations, start_iter=0): - self.batch_sampler = batch_sampler - self.num_iterations = num_iterations - self.start_iter = start_iter - - def __iter__(self): - iteration = self.start_iter - while iteration <= self.num_iterations: - # if the underlying sampler has a set_epoch method, like - # DistributedSampler, used for making each process see - # a different split of the dataset, then set it - if hasattr(self.batch_sampler.sampler, "set_epoch"): - self.batch_sampler.sampler.set_epoch(iteration) - for batch in self.batch_sampler: - iteration += 1 - if iteration > self.num_iterations: - break - yield batch - - def __len__(self): - return self.num_iterations - - -def make_batch_data_sampler(sampler, - images_per_gpu, - num_iters=None, - start_iter=0): - batch_sampler = torch.utils.data.sampler.BatchSampler(sampler, - images_per_gpu, - drop_last=False) - if num_iters is not None and num_iters >= 0: - batch_sampler = IterationBasedBatchSampler(batch_sampler, num_iters, - start_iter) - return batch_sampler - - -def make_data_sampler(dataset, - shuffle, - distributed, - random_seed, - limited_samples=-1): - if distributed: - if dataset.is_composite: - # first_epoch_skip_shuffle not working yet - logger.info( - "Enable NodeSplitSampler with first_epoch_skip_shuffle=True") - return NodeSplitSampler(dataset, - shuffle=shuffle, - random_seed=random_seed, - first_epoch_skip_shuffle=True) - elif limited_samples < 1: - return torch.utils.data.distributed.DistributedSampler( - dataset, shuffle=shuffle, seed=random_seed) - else: # use limited distributed sampler - return DistributedSamplerLimited(dataset, - shuffle=shuffle, - limited=limited_samples) - if shuffle: - sampler = torch.utils.data.sampler.RandomSampler(dataset) - else: - sampler = torch.utils.data.sampler.SequentialSampler(dataset) - return sampler - - -def make_data_loader(args, - yaml_file, - tokenizer, - is_distributed=True, - is_train=True, - start_iter=0, - num_gpus=8): - - dataset = build_dataset(args, yaml_file, tokenizer, is_train=is_train) - if is_train == True: - shuffle = True - images_per_gpu = args.per_gpu_train_batch_size - images_per_batch = images_per_gpu * get_world_size() - iters_per_batch = len(dataset) // images_per_batch - num_iters = iters_per_batch * args.num_train_epochs - logger.info("Train with {} images per GPU.".format(images_per_gpu)) - logger.info("Total batch size {}".format(images_per_batch)) - logger.info("Total training steps {}".format(num_iters)) - else: - shuffle = False - images_per_gpu = 
args.per_gpu_eval_batch_size - num_iters = None - start_iter = 0 - - if hasattr(args, 'limited_samples'): - limited_samples = args.limited_samples // num_gpus - else: - limited_samples = -1 - random_seed = args.seed - sampler = make_data_sampler(dataset, - shuffle, - is_distributed, - limited_samples=limited_samples, - random_seed=random_seed) - batch_sampler = make_batch_data_sampler(sampler, images_per_gpu, num_iters, - start_iter) - data_loader = torch.utils.data.DataLoader( - dataset, - num_workers=args.num_workers, - batch_sampler=batch_sampler, - pin_memory=True, - worker_init_fn=init_seeds, - ) - return data_loader - - -def init_seeds(seed=88): - import os - import random - random.seed(seed) - os.environ['PYTHONHASHSEED'] = str(seed) - import numpy as np - np.random.seed(seed) - torch.manual_seed(seed) - torch.cuda.manual_seed(seed) - torch.cuda.manual_seed_all(seed) \ No newline at end of file diff --git a/AVLFormer/src/evalcap/utils_caption_evaluate.py b/AVLFormer/src/evalcap/utils_caption_evaluate.py deleted file mode 100755 index 1b93e87..0000000 --- a/AVLFormer/src/evalcap/utils_caption_evaluate.py +++ /dev/null @@ -1,429 +0,0 @@ -from collections import OrderedDict, defaultdict -import json -import os.path as op -from pprint import pprint -import re -import subprocess -import tempfile -import time -from typing import Dict, Optional - -import numpy as np -import torch - -from .cider.pyciderevalcap.ciderD.ciderD import CiderD -from .coco_caption.pycocoevalcap.eval import COCOEvalCap -from .coco_caption.pycocoevalcap.meteor.meteor import Meteor -from .coco_caption.pycocotools.coco import COCO - - -def evaluate_on_nocaps(split, predict_file, evaluate_file=None): - ''' - NOTE: Put the auth file in folder ~/.evalai/ - ''' - if not evaluate_file: - evaluate_file = op.splitext(predict_file)[0] + '.eval.json' - if op.isfile(evaluate_file): - print('{} already exists'.format(evaluate_file)) - with open(evaluate_file, 'r') as fp: - metrics = json.load(fp) - return metrics - - image_info_file = 'data/nocaps/nocaps_{}_image_info.json'.format(split) - image_info = json.load(open(image_info_file)) - open_image_id2id = {} - for it in image_info['images']: - open_image_id2id[it['open_images_id']] = it['id'] - predictions = [] - cap_id = 0 - with open(predict_file, 'r') as fp: - for line in fp: - p = line.strip().split('\t') - predictions.append({ - 'image_id': open_image_id2id[p[0]], - 'caption': json.loads(p[1])[0]['caption'], - 'id': cap_id - }) - cap_id += 1 - #submit_file = op.join('/tmp', op.basename(predict_file) + '.submit.json') - #with open(submit_file, 'w') as fp: - #json.dump(predictions, fp) - if split == 'test': - print('Are you sure to submit test split result at: {}'.format( - predict_file)) - import ipdb - ipdb.set_trace() - nocapseval = NocapsEvaluator(phase=split) - metrics = nocapseval.evaluate(predictions) - pprint(metrics) - with open(evaluate_file, 'w') as fp: - json.dump(metrics, fp) - return metrics - - -def evaluate_on_coco_caption(res_file, label_file, outfile=None): - """ - res_tsv: TSV file, each row is [image_key, json format list of captions]. - Each caption is a dict, with fields "caption", "conf". - or JSON file of coco style - label_file: .pt file, contains dict of image key to ground truth labels. 
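The iteration budget that make_data_loader derives for training is simple arithmetic; with illustrative, made-up sizes:

len_dataset = 10000
per_gpu_train_batch_size = 8
world_size = 4
num_train_epochs = 15

images_per_batch = per_gpu_train_batch_size * world_size  # 32
iters_per_batch = len_dataset // images_per_batch         # 312 (floor)
num_iters = iters_per_batch * num_train_epochs            # 4680 training steps
print(images_per_batch, iters_per_batch, num_iters)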
- or JSON file of coco style - """ - if not outfile: - outfile = op.splitext(res_file)[0] + '.eval.json' - - if res_file.endswith('.tsv'): - res_file_coco = op.splitext(res_file)[0] + '_coco_format.json' - convert_tsv_to_coco_format(res_file, res_file_coco) - else: - res_file_coco = res_file - - if label_file.endswith('.pt') or label_file.endswith('.pth'): - label_file_coco = op.splitext(label_file)[0] + '_coco_format.json' - if not op.isfile(label_file_coco): - cap_dict = torch.load(label_file) - for k in cap_dict: - caps = json.loads(cap_dict[k]) - assert isinstance(caps, list) - cap_dict[k] = caps - dump_labels_to_coco_format(cap_dict, label_file_coco) - else: - label_file_coco = label_file - - coco = COCO(label_file_coco) - cocoRes = coco.loadRes(res_file_coco) - cocoEval = COCOEvalCap(coco, cocoRes, 'corpus') - - # evaluate on a subset of images by setting - # cocoEval.params['image_id'] = cocoRes.getImgIds() - # please remove this line when evaluating the full validation set - cocoEval.params['image_id'] = cocoRes.getImgIds() - - # evaluate results - # SPICE will take a few minutes the first time, but speeds up due to caching - cocoEval.evaluate() - result = cocoEval.eval - if not outfile: - print(result) - else: - with open(outfile, 'w') as fp: - json.dump(result, fp, indent=4) - return result - - -def convert_tsv_to_coco_format(res_tsv, - outfile, - sep='\t', - key_col=0, - cap_col=1): - results = [] - with open(res_tsv) as fp: - for line in fp: - parts = line.strip().split(sep) - key = parts[key_col] - if cap_col < len(parts): - caps = json.loads(parts[cap_col]) - if len(caps) == 0: - caps = [{'caption': ''}] - assert len( - caps) == 1, 'cannot evaluate multiple captions per image' - cap = caps[0]['caption'] - else: - # empty caption generated - cap = "" - results.append({'image_id': key, 'caption': cap}) - with open(outfile, 'w') as fp: - json.dump(results, fp) - - -def dump_labels_to_coco_format(cap_dict, outfile): - """ - cap_dict: dict of image key to a list of captions - outfile: file path - """ - annotations = [] - images = [] - cap_id = 0 - for key, caps in cap_dict.items(): - image_id = key - images.append({'id': image_id, 'file_name': key}) - for cap in caps: - annotations.append({ - 'image_id': image_id, - 'caption': cap, - 'id': cap_id - }) - cap_id += 1 - with open(outfile, 'w') as fp: - json.dump( - { - 'annotations': annotations, - 'images': images, - 'type': 'captions', - 'info': 'dummy', - 'licenses': 'dummy' - }, fp) - - -class ScstRewardCriterion(torch.nn.Module): - CIDER_REWARD_WEIGHT = 1 - - def __init__(self, - scorer='cider', - cider_cached_tokens='corpus', - baseline_type='greedy'): - self.scst_scorer_name = scorer - if self.scst_scorer_name == 'meteor': - self.SCST_scorer = Meteor() - else: - self.SCST_scorer = CiderD(df=cider_cached_tokens) - - assert baseline_type in ['greedy', 'sample'] - self.baseline_type = baseline_type - self._cur_score = None - super().__init__() - - def forward(self, gt_res, greedy_res, sample_res, sample_logprobs): - batch_size = len(gt_res) - sample_res_size = len(sample_res) - seq_per_img = sample_res_size // batch_size - - gen_res = [] - gen_res.extend(sample_res) - gt_idx = [i // seq_per_img for i in range(sample_res_size)] - if self.baseline_type == 'greedy': - assert len(greedy_res) == batch_size - gen_res.extend(greedy_res) - gt_idx.extend([i for i in range(batch_size)]) - - scores = self._calculate_eval_scores(gen_res, gt_idx, gt_res) - - if self.baseline_type == 'greedy': - baseline = scores[-batch_size:][:, np.newaxis] - 
- else:
- sc_ = scores.reshape(batch_size, seq_per_img)
- baseline = (sc_.sum(1, keepdims=True) - sc_) / (sc_.shape[1] - 1)
-
- # sample - baseline
- reward = scores[:sample_res_size].reshape(batch_size, seq_per_img)
- self._cur_score = reward.mean()
- reward = reward - baseline
- reward = reward.reshape(sample_res_size)
-
- reward = torch.as_tensor(reward,
- device=sample_logprobs.device,
- dtype=torch.float)
- loss = -sample_logprobs * reward
- loss = loss.mean()
- return loss
-
- def get_score(self):
- return self._cur_score
-
- def _calculate_eval_scores(self, gen_res, gt_idx, gt_res):
- '''
- gen_res: generated captions, list of str
- gt_idx: list of int, of the same length as gen_res
- gt_res: ground truth captions, list of list of str.
- gen_res[i] corresponds to gt_res[gt_idx[i]]
- Each image can have multiple ground truth captions
- '''
- if self.scst_scorer_name == 'meteor':
- gen_res_size = len(gen_res)
-
- res = OrderedDict()
- for i in range(gen_res_size):
- res[i] = [self._wrap_sentence(gen_res[i])]
-
- gts = OrderedDict()
- gt_res_ = [[
- self._wrap_sentence(gt_res[i][j])
- for j in range(len(gt_res[i]))
- ] for i in range(len(gt_res))]
- for i in range(gen_res_size):
- gts[i] = gt_res_[gt_idx[i]]
-
- res_ = OrderedDict()
- for i in range(len(res)):
- res_[i] = res[i]
-
- _, batch_cider_scores = self.SCST_scorer.compute_score(gts, res_)
- batch_cider_scores = np.array(batch_cider_scores)
- scores = self.CIDER_REWARD_WEIGHT * batch_cider_scores
-
- else:
- gen_res_size = len(gen_res)
-
- res = OrderedDict()
- for i in range(gen_res_size):
- res[i] = [self._wrap_sentence(gen_res[i])]
-
- gts = OrderedDict()
- gt_res_ = [[
- self._wrap_sentence(gt_res[i][j])
- for j in range(len(gt_res[i]))
- ] for i in range(len(gt_res))]
- for i in range(gen_res_size):
- gts[i] = gt_res_[gt_idx[i]]
-
- res_ = [{
- 'image_id': i,
- 'caption': res[i]
- } for i in range(len(res))]
-
- _, batch_cider_scores = self.SCST_scorer.compute_score(gts, res_)
- scores = self.CIDER_REWARD_WEIGHT * batch_cider_scores
-
- return scores
-
- def _wrap_sentence(self, s):
- # ensure the sentence ends with the <eos> token,
- # in order to keep consistent with cider_cached_tokens
- r = s.strip()
- if r.endswith('.'):
- r = r[:-1]
- r += ' <eos>'
- return r
-
-
- class NocapsEvaluator(object):
- r"""
- A utility class to submit model predictions on nocaps splits to EvalAI, and retrieve model
- performance based on captioning metrics (such as CIDEr, SPICE).
-
- Extended Summary
- ----------------
- This class and the training script together serve as a working example for "EvalAI in the
- loop", showing how evaluation can be done remotely on privately held splits. Annotations
- (captions) and evaluation-specific tools (e.g. `coco-caption <https://github.com/tylin/coco-caption>`_)
- are not required locally. This enables users to select the best checkpoint, perform early
- stopping, schedule the learning rate based on a metric, etc. without running any evaluation
- locally.
-
- Parameters
- ----------
- phase: str, optional (default = "val")
- Which phase to evaluate on. One of "val" or "test".
-
- Notes
- -----
- This class can be used for retrieving metrics on both the val and test splits. However, we
- recommend avoiding the test split (at least during training): the number of allowed
- submissions to the test split on EvalAI is very limited and can be exhausted within a few
- iterations, whereas the number of submissions to the val split is practically unlimited.
- """
-
- def __init__(self, phase: str = "val"):
-
- # Constants specific to EvalAI.
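# These IDs are assumed to refer to the public nocaps challenge on EvalAI
# (challenge 355), whose phase 742 is the val split and phase 743 the test
# split, matching the ternary below.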
- self._challenge_id = 355
- self._phase_id = 742 if phase == "val" else 743
-
- def evaluate(
- self,
- predictions,
- iteration: Optional[int] = None) -> Dict[str, Dict[str, float]]:
- r"""
- Take the model predictions (in COCO format), submit them to EvalAI, and retrieve model
- performance based on captioning metrics.
-
- Parameters
- ----------
- predictions: List[Prediction]
- Model predictions in COCO format. They are a list of dicts with keys
- ``{"image_id": int, "caption": str}``.
- iteration: int, optional (default = None)
- Training iteration where the checkpoint was evaluated.
-
- Returns
- -------
- Dict[str, Dict[str, float]]
- Model performance based on all captioning metrics. Nested dict structure::
-
- {
- "B1": {"in-domain", "near-domain", "out-domain", "entire"}, # BLEU-1
- "B2": {"in-domain", "near-domain", "out-domain", "entire"}, # BLEU-2
- "B3": {"in-domain", "near-domain", "out-domain", "entire"}, # BLEU-3
- "B4": {"in-domain", "near-domain", "out-domain", "entire"}, # BLEU-4
- "METEOR": {"in-domain", "near-domain", "out-domain", "entire"},
- "ROUGE-L": {"in-domain", "near-domain", "out-domain", "entire"},
- "CIDEr": {"in-domain", "near-domain", "out-domain", "entire"},
- "SPICE": {"in-domain", "near-domain", "out-domain", "entire"},
- }
-
- """
- # Save predictions as a json file first.
- _, predictions_filename = tempfile.mkstemp(suffix=".json", text=True)
- with open(predictions_filename, "w") as f:
- json.dump(predictions, f)
-
- submission_command = (
- f"evalai challenge {self._challenge_id} phase {self._phase_id} "
- f"submit --file {predictions_filename}")
-
- submission_command_subprocess = subprocess.Popen(
- submission_command.split(),
- stdout=subprocess.PIPE,
- stdin=subprocess.PIPE,
- stderr=subprocess.STDOUT,
- )
-
- # This terminal output will have the submission ID we need to check.
- submission_command_stdout = submission_command_subprocess.communicate(
- input=b"N\n")[0].decode("utf-8")
-
- submission_id_regex = re.search("evalai submission ([0-9]+)",
- submission_command_stdout)
- try:
- # Get an integer submission ID (as a string).
- submission_id = submission_id_regex.group(0).split()[
- -1] # type: ignore
- except AttributeError:
- # Very unlikely, but the submission may fail because of some glitch.
- # Retry in that case.
- return self.evaluate(predictions, iteration)
-
- if iteration is not None:
- print(
- f"Submitted predictions for iteration {iteration}, submission id: {submission_id}."
- )
- else:
- print(f"Submitted predictions, submission id: {submission_id}.")
-
- # Placeholder stdout for a pending submission.
- result_stdout: str = "The Submission is yet to be evaluated."
- num_tries: int = 0
-
- # Query every 10 seconds for result until it appears.
- while "CIDEr" not in result_stdout:
-
- time.sleep(10)
- result_stdout = subprocess.check_output(
- ["evalai", "submission", submission_id,
- "result"]).decode("utf-8")
- num_tries += 1
-
- # Raise error if it takes more than 5 minutes (30 tries x 10 s).
- if num_tries == 30:
- raise ConnectionError(
- "Unable to get results from EvalAI within 5 minutes!")
-
- # Parse the result JSON. (The `encoding` kwarg of json.loads is
- # deprecated and unnecessary for str input, so it is not passed.)
- metrics = json.loads(result_stdout)
-
- # keys: {"in-domain", "near-domain", "out-domain", "entire"}
- # In each of these, keys: {"B1", "B2", "B3", "B4", "METEOR", "ROUGE-L", "CIDEr", "SPICE"}
- metrics = {
- "in-domain": metrics[0]["in-domain"],
- "near-domain": metrics[1]["near-domain"],
- "out-domain": metrics[2]["out-domain"],
- "entire": metrics[3]["entire"],
- }
-
- # Restructure the metrics dict for better tensorboard logging.
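# A minimal sketch of the flip performed below, assuming an EvalAI-style
# response: {"in-domain": {"B1": 0.8}, "entire": {"B1": 0.7}} becomes
# {"B1": {"in-domain": 0.8, "entire": 0.7}}, i.e. each metric now groups
# its domain splits together.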
- # keys: {"B1", "B2", "B3", "B4", "METEOR", "ROUGE-L", "CIDEr", "SPICE"}
- # In each of these, keys: {"in-domain", "near-domain", "out-domain", "entire"}
- flipped_metrics: Dict[str, Dict[str, float]] = defaultdict(dict)
- for key, val in metrics.items():
- for subkey, subval in val.items():
- flipped_metrics[subkey][key] = subval
-
- return flipped_metrics
diff --git a/AVLFormer/src/layers/bert/__init__.py b/AVLFormer/src/layers/bert/__init__.py
deleted file mode 100644
index 627f49c..0000000
--- a/AVLFormer/src/layers/bert/__init__.py
+++ /dev/null
@@ -1,31 +0,0 @@
-__version__ = "1.0.0"
-from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path
-from .modeling_bert import (
- BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
- BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
- BertConfig,
- BertForImageCaptioning,
- BertForMaskedLM,
- BertForMultipleChoice,
- BertForNextSentencePrediction,
- BertForPreTraining,
- BertForQuestionAnswering,
- BertForSequenceClassification,
- BertForTokenClassification,
- BertForVLGrounding,
- BertImgForGroundedPreTraining,
- BertImgForPreTraining,
- BertModel,
- load_tf_weights_in_bert,
-)
-from .modeling_utils import (
- CONFIG_NAME,
- TF_WEIGHTS_NAME,
- WEIGHTS_NAME,
- Conv1D,
- PretrainedConfig,
- PreTrainedModel,
- prune_layer,
-)
-from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer
-from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
diff --git a/AVLFormer/src/layers/bert/file_utils.py b/AVLFormer/src/layers/bert/file_utils.py
deleted file mode 100644
index f7797f3..0000000
--- a/AVLFormer/src/layers/bert/file_utils.py
+++ /dev/null
@@ -1,271 +0,0 @@
-"""
-Utilities for working with the local dataset cache.
-This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
-Copyright by the AllenNLP authors.
-"""
-from __future__ import absolute_import, division, print_function, unicode_literals
-
-import fnmatch
-from functools import wraps
-from hashlib import sha256
-from io import open
-import json
-import os
-import shutil
-import sys
-import tempfile
-
-import boto3
-from botocore.exceptions import ClientError
-import requests
-from tqdm import tqdm
-
-try:
- from torch.hub import _get_torch_home
- torch_cache_home = _get_torch_home()
-except ImportError:
- torch_cache_home = os.path.expanduser(
- os.getenv(
- 'TORCH_HOME',
- os.path.join(os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch')))
-default_cache_path = os.path.join(torch_cache_home, 'pytorch_transformers')
-
-try:
- from urllib.parse import urlparse
-except ImportError:
- from urlparse import urlparse
-
-try:
- from pathlib import Path
- PYTORCH_PRETRAINED_BERT_CACHE = Path(
- os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path))
-except (AttributeError, ImportError):
- PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
- default_cache_path)
-
-import logging
-
-from src.utils.comm import is_main_process
-
-logger = logging.getLogger(__name__) # pylint: disable=invalid-name
-if not is_main_process():
- logger.disabled = True
-
-
-def url_to_filename(url, etag=None):
- """
- Convert `url` into a hashed filename in a repeatable way.
- If `etag` is specified, append its hash to the url's, delimited
- by a period.
- """
- url_bytes = url.encode('utf-8')
- url_hash = sha256(url_bytes)
- filename = url_hash.hexdigest()
-
- if etag:
- etag_bytes = etag.encode('utf-8')
- etag_hash = sha256(etag_bytes)
- filename += '.'
+ etag_hash.hexdigest() - - return filename - - -def filename_to_url(filename, cache_dir=None): - """ - Return the url and etag (which may be ``None``) stored for `filename`. - Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. - """ - if cache_dir is None: - cache_dir = PYTORCH_PRETRAINED_BERT_CACHE - if sys.version_info[0] == 3 and isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - cache_path = os.path.join(cache_dir, filename) - if not os.path.exists(cache_path): - raise EnvironmentError("file {} not found".format(cache_path)) - - meta_path = cache_path + '.json' - if not os.path.exists(meta_path): - raise EnvironmentError("file {} not found".format(meta_path)) - - with open(meta_path, encoding="utf-8") as meta_file: - metadata = json.load(meta_file) - url = metadata['url'] - etag = metadata['etag'] - - return url, etag - - -def cached_path(url_or_filename, cache_dir=None): - """ - Given something that might be a URL (or might be a local path), - determine which. If it's a URL, download the file and cache it, and - return the path to the cached file. If it's already a local path, - make sure the file exists and then return the path. - """ - if cache_dir is None: - cache_dir = PYTORCH_PRETRAINED_BERT_CACHE - if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): - url_or_filename = str(url_or_filename) - if sys.version_info[0] == 3 and isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - parsed = urlparse(url_or_filename) - - if parsed.scheme in ('http', 'https', 's3'): - # URL, so get it from the cache (downloading if necessary) - return get_from_cache(url_or_filename, cache_dir) - elif os.path.exists(url_or_filename): - # File, and it exists. - return url_or_filename - elif parsed.scheme == '': - # File, but it doesn't exist. - raise EnvironmentError("file {} not found".format(url_or_filename)) - else: - # Something unknown - raise ValueError( - "unable to parse {} as a URL or as a local path".format( - url_or_filename)) - - -def split_s3_path(url): - """Split a full s3 path into the bucket name and path.""" - parsed = urlparse(url) - if not parsed.netloc or not parsed.path: - raise ValueError("bad s3 path {}".format(url)) - bucket_name = parsed.netloc - s3_path = parsed.path - # Remove '/' at beginning of path. - if s3_path.startswith("/"): - s3_path = s3_path[1:] - return bucket_name, s3_path - - -def s3_request(func): - """ - Wrapper function for s3 requests in order to create more helpful error - messages. 
- """ - - @wraps(func) - def wrapper(url, *args, **kwargs): - try: - return func(url, *args, **kwargs) - except ClientError as exc: - if int(exc.response["Error"]["Code"]) == 404: - raise EnvironmentError("file {} not found".format(url)) - else: - raise - - return wrapper - - -@s3_request -def s3_etag(url): - """Check ETag on S3 object.""" - s3_resource = boto3.resource("s3") - bucket_name, s3_path = split_s3_path(url) - s3_object = s3_resource.Object(bucket_name, s3_path) - return s3_object.e_tag - - -@s3_request -def s3_get(url, temp_file): - """Pull a file directly from S3.""" - s3_resource = boto3.resource("s3") - bucket_name, s3_path = split_s3_path(url) - s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) - - -def http_get(url, temp_file): - req = requests.get(url, stream=True) - content_length = req.headers.get('Content-Length') - total = int(content_length) if content_length is not None else None - progress = tqdm(unit="B", total=total) - for chunk in req.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - progress.update(len(chunk)) - temp_file.write(chunk) - progress.close() - - -def get_from_cache(url, cache_dir=None): - """ - Given a URL, look for the corresponding dataset in the local cache. - If it's not there, download it. Then return the path to the cached file. - """ - if cache_dir is None: - cache_dir = PYTORCH_PRETRAINED_BERT_CACHE - if sys.version_info[0] == 3 and isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - if sys.version_info[0] == 2 and not isinstance(cache_dir, str): - cache_dir = str(cache_dir) - - if not os.path.exists(cache_dir): - os.makedirs(cache_dir) - - # Get eTag to add to filename, if it exists. - if url.startswith("s3://"): - etag = s3_etag(url) - else: - try: - response = requests.head(url, allow_redirects=True) - if response.status_code != 200: - etag = None - else: - etag = response.headers.get("ETag") - except EnvironmentError: - etag = None - - if sys.version_info[0] == 2 and etag is not None: - etag = etag.decode('utf-8') - filename = url_to_filename(url, etag) - - # get cache path to put the file - cache_path = os.path.join(cache_dir, filename) - - # If we don't have a connection (etag is None) and can't identify the file - # try to get the last downloaded one - if not os.path.exists(cache_path) and etag is None: - matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*') - matching_files = list( - filter(lambda s: not s.endswith('.json'), matching_files)) - if matching_files: - cache_path = os.path.join(cache_dir, matching_files[-1]) - - if not os.path.exists(cache_path): - # Download to temporary file, then copy to cache dir once finished. - # Otherwise you get corrupt cache entries if the download gets interrupted. 
- with tempfile.NamedTemporaryFile() as temp_file: - logger.info("%s not found in cache, downloading to %s", url, - temp_file.name) - - # GET file object - if url.startswith("s3://"): - s3_get(url, temp_file) - else: - http_get(url, temp_file) - - # we are copying the file before closing it, so flush to avoid truncation - temp_file.flush() - # shutil.copyfileobj() starts at the current position, so go to the start - temp_file.seek(0) - - logger.info("copying %s to cache at %s", temp_file.name, - cache_path) - with open(cache_path, 'wb') as cache_file: - shutil.copyfileobj(temp_file, cache_file) - - logger.info("creating metadata file for %s", cache_path) - meta = {'url': url, 'etag': etag} - meta_path = cache_path + '.json' - with open(meta_path, 'w') as meta_file: - output_string = json.dumps(meta) - if sys.version_info[0] == 2 and isinstance(output_string, str): - output_string = unicode(output_string, - 'utf-8') # The beauty of python 2 - meta_file.write(output_string) - - logger.info("removing temp file %s", temp_file.name) - - return cache_path diff --git a/AVLFormer/src/layers/bert/modeling_bert.py b/AVLFormer/src/layers/bert/modeling_bert.py deleted file mode 100644 index e99934c..0000000 --- a/AVLFormer/src/layers/bert/modeling_bert.py +++ /dev/null @@ -1,3146 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch BERT model. 
""" - -from __future__ import absolute_import, division, print_function, unicode_literals - -from io import open -import json -import logging -import math -import os -import sys - -from src.utils.comm import is_main_process -import torch -from torch import nn -from torch.distributions import Categorical, kl_divergence -from torch.nn import CrossEntropyLoss, MSELoss -import torch.nn.functional as F -from torch.nn.utils.weight_norm import weight_norm - -from .modeling_utils import ( - CONFIG_NAME, - WEIGHTS_NAME, - PretrainedConfig, - PreTrainedModel, - add_start_docstrings, - prune_linear_layer, -) - -logger = logging.getLogger(__name__) # pylint: disable=invalid-name -if not is_main_process(): - logger.disabled = True - -import torch.utils.checkpoint as torch_checkpoint - -BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'bert-base-uncased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", - 'bert-large-uncased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", - 'bert-base-cased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", - 'bert-large-cased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", - 'bert-base-multilingual-uncased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", - 'bert-base-multilingual-cased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", - 'bert-base-chinese': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", - 'bert-base-german-cased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin", - 'bert-large-uncased-whole-word-masking': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", - 'bert-large-cased-whole-word-masking': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", - 'bert-large-uncased-whole-word-masking-finetuned-squad': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", - 'bert-large-cased-whole-word-masking-finetuned-squad': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", - 'bert-base-cased-finetuned-mrpc': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", -} - -BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'bert-base-uncased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", - 'bert-large-uncased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", - 'bert-base-cased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", - 'bert-large-cased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", - 'bert-base-multilingual-uncased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", - 'bert-base-multilingual-cased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", - 'bert-base-chinese': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", - 'bert-base-german-cased': - 
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", - 'bert-large-uncased-whole-word-masking': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", - 'bert-large-cased-whole-word-masking': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", - 'bert-large-uncased-whole-word-masking-finetuned-squad': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", - 'bert-large-cased-whole-word-masking-finetuned-squad': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", - 'bert-base-cased-finetuned-mrpc': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", -} - - -def load_tf_weights_in_bert(model, config, tf_checkpoint_path): - """ Load tf checkpoints in a pytorch model. - """ - try: - import re - - import numpy as np - import tensorflow as tf - except ImportError: - logger.error( - "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions." - ) - raise - tf_path = os.path.abspath(tf_checkpoint_path) - logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) - # Load weights from TF model - init_vars = tf.train.list_variables(tf_path) - names = [] - arrays = [] - for name, shape in init_vars: - logger.info("Loading TF weight {} with shape {}".format(name, shape)) - array = tf.train.load_variable(tf_path, name) - names.append(name) - arrays.append(array) - - for name, array in zip(names, arrays): - name = name.split('/') - # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v - # which are not required for using pretrained model - if any(n in ["adam_v", "adam_m", "global_step"] for n in name): - logger.info("Skipping {}".format("/".join(name))) - continue - pointer = model - for m_name in name: - if re.fullmatch(r'[A-Za-z]+_\d+', m_name): - l = re.split(r'_(\d+)', m_name) - else: - l = [m_name] - if l[0] == 'kernel' or l[0] == 'gamma': - pointer = getattr(pointer, 'weight') - elif l[0] == 'output_bias' or l[0] == 'beta': - pointer = getattr(pointer, 'bias') - elif l[0] == 'output_weights': - pointer = getattr(pointer, 'weight') - elif l[0] == 'squad': - pointer = getattr(pointer, 'classifier') - else: - try: - pointer = getattr(pointer, l[0]) - except AttributeError: - logger.info("Skipping {}".format("/".join(name))) - continue - if len(l) >= 2: - num = int(l[1]) - pointer = pointer[num] - if m_name[-11:] == '_embeddings': - pointer = getattr(pointer, 'weight') - elif m_name == 'kernel': - array = np.transpose(array) - try: - assert pointer.shape == array.shape - except AssertionError as e: - e.args += (pointer.shape, array.shape) - raise - logger.info("Initialize PyTorch weight {}".format(name)) - pointer.data = torch.from_numpy(array) - return model - - -def gelu(x): - """Implementation of the gelu activation function. 
- For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
- 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
- Also see https://arxiv.org/abs/1606.08415
- """
- return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
-
-
-def swish(x):
- return x * torch.sigmoid(x)
-
-
-ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
-
-
-class BertConfig(PretrainedConfig):
- r"""
- :class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a
- `BertModel`.
-
-
- Arguments:
- vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
- hidden_size: Size of the encoder layers and the pooler layer.
- num_hidden_layers: Number of hidden layers in the Transformer encoder.
- num_attention_heads: Number of attention heads for each attention layer in
- the Transformer encoder.
- intermediate_size: The size of the "intermediate" (i.e., feed-forward)
- layer in the Transformer encoder.
- hidden_act: The non-linear activation function (function or string) in the
- encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
- hidden_dropout_prob: The dropout probability for all fully connected
- layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob: The dropout ratio for the attention
- probabilities.
- max_position_embeddings: The maximum sequence length that this model might
- ever be used with. Typically set this to something large just in case
- (e.g., 512 or 1024 or 2048).
- type_vocab_size: The vocabulary size of the `token_type_ids` passed into
- `BertModel`.
- initializer_range: The stddev of the truncated_normal_initializer for
- initializing all weight matrices.
- layer_norm_eps: The epsilon used by LayerNorm.
- """
- pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
-
- def __init__(self,
- vocab_size_or_config_json_file=30522,
- hidden_size=768,
- num_hidden_layers=12,
- num_attention_heads=12,
- intermediate_size=3072,
- hidden_act="gelu",
- hidden_dropout_prob=0.1,
- attention_probs_dropout_prob=0.1,
- max_position_embeddings=512,
- type_vocab_size=2,
- initializer_range=0.02,
- layer_norm_eps=1e-12,
- **kwargs):
- super(BertConfig, self).__init__(**kwargs)
- if isinstance(vocab_size_or_config_json_file,
- str) or (sys.version_info[0] == 2 and isinstance(
- vocab_size_or_config_json_file, unicode)):
- with open(vocab_size_or_config_json_file, "r",
- encoding='utf-8') as reader:
- json_config = json.loads(reader.read())
- for key, value in json_config.items():
- self.__dict__[key] = value
- elif isinstance(vocab_size_or_config_json_file, int):
- self.vocab_size = vocab_size_or_config_json_file
- self.hidden_size = hidden_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.hidden_act = hidden_act
- self.intermediate_size = intermediate_size
- self.hidden_dropout_prob = hidden_dropout_prob
- self.attention_probs_dropout_prob = attention_probs_dropout_prob
- self.max_position_embeddings = max_position_embeddings
- self.type_vocab_size = type_vocab_size
- self.initializer_range = initializer_range
- self.layer_norm_eps = layer_norm_eps
- else:
- raise ValueError(
- "First argument must be either a vocabulary size (int) "
- "or the path to a pretrained model config file (str)")
-
-
-# try:
-# from apex.normalization.fused_layer_norm import FusedLayerNorm
-# except ImportError:
-# logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
-# FusedLayerNorm = None
-
-# class BertLayerNorm(nn.Module):
-# def __init__(self, hidden_size, eps=1e-12):
-# """Construct a layernorm module in the TF style (epsilon inside the square root).
-# """
-# super(BertLayerNorm, self).__init__()
-# self.weight = nn.Parameter(torch.ones(hidden_size))
-# self.bias = nn.Parameter(torch.zeros(hidden_size))
-# self.variance_epsilon = eps

-# def forward(self, x):
-# u = x.mean(-1, keepdim=True)
-# s = (x - u).pow(2).mean(-1, keepdim=True)
-# x = (x - u) / torch.sqrt(s + self.variance_epsilon)
-# return self.weight * x + self.bias
-
-# Prefer the apex FusedLayerNorm, but avoid the conditional import: weights
-# from FusedLayerNorm can be loaded into BertLayerNorm, but not the other way around.
-# LayerNormClass = FusedLayerNorm or BertLayerNorm
-LayerNormClass = torch.nn.LayerNorm
-BertLayerNorm = torch.nn.LayerNorm
-
-
-class BertEmbeddings(nn.Module):
- """Construct the embeddings from word, position and token_type embeddings.
- """ - - def __init__(self, config): - super(BertEmbeddings, self).__init__() - self.word_embeddings = nn.Embedding(config.vocab_size, - config.hidden_size, - padding_idx=0) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, - config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, - config.hidden_size) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file - self.LayerNorm = LayerNormClass(config.hidden_size, - eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, input_ids, token_type_ids=None, position_ids=None): - seq_length = input_ids.size(1) - if position_ids is None: - position_ids = torch.arange(seq_length, - dtype=torch.long, - device=input_ids.device) - position_ids = position_ids.unsqueeze(0).expand_as(input_ids) - if token_type_ids is None: - token_type_ids = torch.zeros_like(input_ids) - - words_embeddings = self.word_embeddings(input_ids) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = words_embeddings + position_embeddings + token_type_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - -class BertImgEmbeddings(nn.Module): - """ BERT Language - Image Embedding - Construct the embeddings from word & Images, position and token_type embeddings. - """ - - def __init__(self, config): - super(BertImgEmbeddings, self).__init__() - self.img_dim = 565 - - self.img_embeddings = nn.Linear(self.img_dim, - self.config.hidden_size, - bias=True) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, - config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, - config.hidden_size) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file - self.LayerNorm = LayerNormClass(config.hidden_size, - eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, input_ids, token_type_ids=None, position_ids=None): - seq_length = input_ids.size(1) - if position_ids is None: - position_ids = torch.arange(seq_length, - dtype=torch.long, - device=input_ids.device) - position_ids = position_ids.unsqueeze(0).expand_as(input_ids) - if token_type_ids is None: - token_type_ids = torch.zeros_like(input_ids) - - img_embeddings = self.img_embeddings(input_ids) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = img_embeddings + position_embeddings + token_type_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - -class BertSelfAttention(nn.Module): - - def __init__(self, config): - super(BertSelfAttention, self).__init__() - if config.hidden_size % config.num_attention_heads != 0: - raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % - (config.hidden_size, config.num_attention_heads)) - self.output_attentions = config.output_attentions - - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / - config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = 
nn.Linear(config.hidden_size, self.all_head_size)
- self.key = nn.Linear(config.hidden_size, self.all_head_size)
- self.value = nn.Linear(config.hidden_size, self.all_head_size)
-
- self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
-
- def transpose_for_scores(self, x):
- if torch._C._get_tracing_state():
- # the exporter is not smart enough to detect dynamic size for some paths
- x = x.view(x.shape[0], -1, self.num_attention_heads,
- self.attention_head_size)
- else:
- new_x_shape = x.size()[:-1] + (self.num_attention_heads,
- self.attention_head_size)
- x = x.view(*new_x_shape)
- return x.permute(0, 2, 1, 3)
-
- def forward(self,
- hidden_states,
- attention_mask,
- head_mask=None,
- history_state=None):
- if history_state is not None:
- x_states = torch.cat([history_state, hidden_states], dim=1)
- mixed_query_layer = self.query(hidden_states)
- mixed_key_layer = self.key(x_states)
- mixed_value_layer = self.value(x_states)
- else:
- mixed_query_layer = self.query(hidden_states)
- mixed_key_layer = self.key(hidden_states)
- mixed_value_layer = self.value(hidden_states)
-
- query_layer = self.transpose_for_scores(mixed_query_layer)
- key_layer = self.transpose_for_scores(mixed_key_layer)
- value_layer = self.transpose_for_scores(mixed_value_layer)
-
- # Take the dot product between "query" and "key" to get the raw attention scores.
- attention_scores = torch.matmul(query_layer,
- key_layer.transpose(-1, -2))
- attention_scores = attention_scores / math.sqrt(
- self.attention_head_size)
- # Apply the attention mask (precomputed for all layers in the BertModel forward() function)
- attention_scores = attention_scores + attention_mask
-
- # Normalize the attention scores to probabilities.
- attention_probs = nn.Softmax(dim=-1)(attention_scores)
-
- # This is actually dropping out entire tokens to attend to, which might
- # seem a bit unusual, but is taken from the original Transformer paper.
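# Worked example (illustrative, not from the original source): with p = 0.2,
# a softmax row [0.5, 0.3, 0.2] may become [0.625, 0.0, 0.25]; surviving
# weights are rescaled by 1/(1-p), and the dropped key is simply never
# attended to for that query on this forward pass.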
- attention_probs = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - - context_layer = torch.matmul(attention_probs, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + ( - self.all_head_size, ) - context_layer = context_layer.view(*new_context_layer_shape) - - outputs = (context_layer, - attention_probs) if self.output_attentions else ( - context_layer, ) - return outputs - - -class BertSelfOutput(nn.Module): - - def __init__(self, config): - super(BertSelfOutput, self).__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = LayerNormClass(config.hidden_size, - eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class BertAttention(nn.Module): - - def __init__(self, config): - super(BertAttention, self).__init__() - self.self = BertSelfAttention(config) - self.output = BertSelfOutput(config) - - def prune_heads(self, heads): - if len(heads) == 0: - return - mask = torch.ones(self.self.num_attention_heads, - self.self.attention_head_size) - for head in heads: - mask[head] = 0 - mask = mask.view(-1).contiguous().eq(1) - index = torch.arange(len(mask))[mask].long() - # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - # Update hyper params - self.self.num_attention_heads = self.self.num_attention_heads - len( - heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - - def forward(self, - input_tensor, - attention_mask, - head_mask=None, - history_state=None): - self_outputs = self.self(input_tensor, attention_mask, head_mask, - history_state) - attention_output = self.output(self_outputs[0], input_tensor) - outputs = (attention_output, - ) + self_outputs[1:] # add attentions if we output them - return outputs - - -class BertIntermediate(nn.Module): - - def __init__(self, config): - super(BertIntermediate, self).__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, - str) or (sys.version_info[0] == 2 - and isinstance(config.hidden_act, unicode)): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -class BertOutput(nn.Module): - - def __init__(self, config): - super(BertOutput, self).__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = LayerNormClass(config.hidden_size, - eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class BertLayer(nn.Module): - - def 
__init__(self, config): - super(BertLayer, self).__init__() - self.attention = BertAttention(config) - self.intermediate = BertIntermediate(config) - self.output = BertOutput(config) - - def forward(self, - hidden_states, - attention_mask, - head_mask=None, - history_state=None): - attention_outputs = self.attention(hidden_states, attention_mask, - head_mask, history_state) - attention_output = attention_outputs[0] - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - outputs = (layer_output, ) + attention_outputs[ - 1:] # add attentions if we output them - return outputs - - -class TIMMVitEncoder(nn.Module): - - def __init__(self, config): - super().__init__() - logger.info(config) - from src import timm - logger.info('Loading network: {}'.format(config.net)) - logger.info('pretrained: {}'.format(config.pretrained)) - extra_param = getattr(config, 'timm_param', {}) - model = timm.create_model( - config.net, - pretrained=config.pretrained, - **extra_param, - ) - self.blocks = model.blocks - self.patch_embed = model.patch_embed - self.pos_drop = model.pos_drop - self.pos_embed = model.pos_embed - - def forward(self, - hidden_states, - attention_mask, - head_mask=None, - encoder_history_states=None): - assert all(m is None for m in head_mask), 'not supported' - assert encoder_history_states is None, 'not supported' - - for blk in self.blocks: - # hidden_states = blk(hidden_states) - hidden_states = blk(hidden_states, attention_mask) - return (hidden_states, ) - - -class BertEncoder(nn.Module): - - def __init__(self, config): - super(BertEncoder, self).__init__() - self.output_attentions = config.output_attentions - self.output_hidden_states = config.output_hidden_states - self.layer = nn.ModuleList( - [BertLayer(config) for _ in range(config.num_hidden_layers)]) - - def set_output_attentions(self, flag): - for idx in range(len(self.layer)): - self.layer[idx].attention.self.output_attentions = flag - self.output_attentions = flag - - def forward(self, - hidden_states, - attention_mask, - head_mask=None, - encoder_history_states=None): - all_hidden_states = () - all_attentions = () - for i, layer_module in enumerate(self.layer): - if self.output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - history_state = None if encoder_history_states is None else encoder_history_states[ - i] - # layer_outputs = layer_module( - # hidden_states, attention_mask, head_mask[i], - # history_state) - layer_outputs = layer_module( - hidden_states, - attention_mask, - (None if head_mask is None else head_mask[i]), - history_state, - ) - hidden_states = layer_outputs[0] - - if self.output_attentions: - all_attentions = all_attentions + (layer_outputs[1], ) - - outputs = (hidden_states, ) - if self.output_hidden_states: - outputs = outputs + (all_hidden_states, ) - if self.output_attentions: - outputs = outputs + (all_attentions, ) - return outputs # outputs, (hidden states), (attentions) - - -class BertPooler(nn.Module): - - def __init__(self, config): - super(BertPooler, self).__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. 
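# In shape terms: hidden_states is (batch, seq_len, hidden), and the slice
# below keeps only the first ([CLS]) position, giving (batch, hidden).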
- first_token_tensor = hidden_states[:, 0]
- pooled_output = self.dense(first_token_tensor)
- pooled_output = self.activation(pooled_output)
- return pooled_output
-
-
-class BertPredictionHeadTransform(nn.Module):
-
- def __init__(self, config):
- super(BertPredictionHeadTransform, self).__init__()
- self.dense = nn.Linear(config.hidden_size, config.hidden_size)
- if isinstance(config.hidden_act,
- str) or (sys.version_info[0] == 2
- and isinstance(config.hidden_act, unicode)):
- self.transform_act_fn = ACT2FN[config.hidden_act]
- else:
- self.transform_act_fn = config.hidden_act
- self.LayerNorm = LayerNormClass(config.hidden_size,
- eps=config.layer_norm_eps)
-
- def forward(self, hidden_states):
- hidden_states = self.dense(hidden_states)
- hidden_states = self.transform_act_fn(hidden_states)
- hidden_states = self.LayerNorm(hidden_states)
- return hidden_states
-
-
-class BertLMPredictionHead(nn.Module):
-
- def __init__(self, config):
- super(BertLMPredictionHead, self).__init__()
- self.transform = BertPredictionHeadTransform(config)
-
- # The output weights are the same as the input embeddings, but there is
- # an output-only bias for each token.
- self.decoder = nn.Linear(config.hidden_size,
- config.vocab_size,
- bias=False)
-
- self.bias = nn.Parameter(torch.zeros(config.vocab_size))
-
- def forward(self, hidden_states):
- hidden_states = self.transform(hidden_states)
- hidden_states = self.decoder(hidden_states) + self.bias
- return hidden_states
-
-
-# cclin
-class BertIFPredictionHead(nn.Module):
- # image feature
- def __init__(self, config):
- super(BertIFPredictionHead, self).__init__()
- self.transform = BertPredictionHeadTransform(config)
-
- # The output weights are the same as the input embeddings, but there is
- # an output-only bias for each token.
- self.decoder = nn.Linear(
- config.hidden_size,
- 2048, # TODO: HACK!! cclin
- bias=False)
- # config.vocab_size,
-
- self.bias = nn.Parameter(torch.zeros(2048)) # TODO: HACK!! cclin
-
- def forward(self, hidden_states):
- hidden_states = self.transform(hidden_states)
- hidden_states = self.decoder(hidden_states) + self.bias
- return torch.nn.functional.relu(hidden_states)
- #return hidden_states
-
-
-class BertOnlyMLMHead(nn.Module):
-
- def __init__(self, config):
- super(BertOnlyMLMHead, self).__init__()
- self.predictions = BertLMPredictionHead(config)
-
- def forward(self, sequence_output):
- prediction_scores = self.predictions(sequence_output)
- return prediction_scores
-
-
-class BertOnlyNSPHead(nn.Module):
-
- def __init__(self, config):
- super(BertOnlyNSPHead, self).__init__()
- self.seq_relationship = nn.Linear(config.hidden_size, 2)
-
- def forward(self, pooled_output):
- seq_relationship_score = self.seq_relationship(pooled_output)
- return seq_relationship_score
-
-
-class BertPreTrainingHeads(nn.Module):
-
- def __init__(self, config):
- super(BertPreTrainingHeads, self).__init__()
- self.predictions = BertLMPredictionHead(config)
- num_seq_relations = config.num_contrast_classes if hasattr(
- config, "num_contrast_classes") else 2
- self.seq_relationship = nn.Linear(config.hidden_size,
- num_seq_relations)
-
- def forward(self, sequence_output, pooled_output):
- prediction_scores = self.predictions(sequence_output)
- seq_relationship_score = self.seq_relationship(pooled_output)
- return prediction_scores, seq_relationship_score
-
-
-class BertPreTrainedModel(PreTrainedModel):
- """ An abstract class to handle weights initialization and
- a simple interface for downloading and loading pretrained models.
- """ - config_class = BertConfig - pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP - load_tf_weights = load_tf_weights_in_bert - base_model_prefix = "bert" - - def __init__(self, *inputs, **kwargs): - super(BertPreTrainedModel, self).__init__(*inputs, **kwargs) - - def init_weights(self, module): - """ Initialize the weights. - """ - if isinstance(module, (nn.Linear, nn.Embedding)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, - std=self.config.initializer_range) - elif isinstance(module, BertLayerNorm) or isinstance( - module, LayerNormClass): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() - - -BERT_START_DOCSTRING = r""" The BERT model was proposed in - `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ - by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer - pre-trained using a combination of masked language modeling objective and next sentence prediction - on a large corpus comprising the Toronto Book Corpus and Wikipedia. - - This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and - refer to the PyTorch documentation for all matter related to general usage and behavior. - - .. _`BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`: - https://arxiv.org/abs/1810.04805 - - .. _`torch.nn.Module`: - https://pytorch.org/docs/stable/nn.html#module - - Parameters: - config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model. -""" - -BERT_INPUTS_DOCSTRING = r""" - Inputs: - **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Indices of input sequence tokens in the vocabulary. - To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows: - - (a) For sequence pairs: - - ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` - - ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` - - (b) For single sequences: - - ``tokens: [CLS] the dog is hairy . [SEP]`` - - ``token_type_ids: 0 0 0 0 0 0 0`` - - Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`. - See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and - :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. - **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Indices of positions of each input sequence tokens in the position embeddings. - Selected in the range ``[0, config.max_position_embeddings - 1[``. - **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Segment token indices to indicate first and second portions of the inputs. - Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` - corresponds to a `sentence B` token - (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). - **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: - Mask to avoid performing attention on padding token indices. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. 
- **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
- Mask to nullify selected heads of the self-attention modules.
- Mask values selected in ``[0, 1]``:
- ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
-"""
-
-
-@add_start_docstrings(
- "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
- BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
-class BertModel(BertPreTrainedModel):
- r"""
- Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
- **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
- Sequence of hidden-states at the output of the last layer of the model.
- **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
- Last layer hidden-state of the first token of the sequence (classification token)
- further processed by a Linear layer and a Tanh activation function. The Linear
- layer weights are trained from the next sentence prediction (classification)
- objective during Bert pretraining. This output is usually *not* a good summary
- of the semantic content of the input; you're often better off averaging or pooling
- the sequence of hidden-states for the whole input sequence.
- **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
- list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
- of shape ``(batch_size, sequence_length, hidden_size)``:
- Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- **attentions**: (`optional`, returned when ``config.output_attentions=True``)
- list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
- Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
- Examples::
-
- >>> config = BertConfig.from_pretrained('bert-base-uncased')
- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- >>> model = BertModel(config)
- >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
- >>> outputs = model(input_ids)
- >>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
-
- """
-
- def __init__(self, config):
- super(BertModel, self).__init__(config)
-
- self.embeddings = BertEmbeddings(config)
- self.encoder = BertEncoder(config)
- self.pooler = BertPooler(config)
-
- self.apply(self.init_weights)
-
- def _resize_token_embeddings(self, new_num_tokens):
- old_embeddings = self.embeddings.word_embeddings
- new_embeddings = self._get_resized_embeddings(old_embeddings,
- new_num_tokens)
- self.embeddings.word_embeddings = new_embeddings
- return self.embeddings.word_embeddings
-
- def _prune_heads(self, heads_to_prune):
- """ Prunes heads of the model.
- heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
- See base class PreTrainedModel
- """
- for layer, heads in heads_to_prune.items():
- self.encoder.layer[layer].attention.prune_heads(heads)
-
- def forward(self,
- input_ids,
- token_type_ids=None,
- attention_mask=None,
- position_ids=None,
- head_mask=None):
- if attention_mask is None:
- attention_mask = torch.ones_like(input_ids)
- if token_type_ids is None:
- token_type_ids = torch.zeros_like(input_ids)
-
- # We create a 3D attention mask from a 2D tensor mask.
- # Sizes are [batch_size, 1, 1, to_seq_length]
- # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
- # this attention mask is simpler than the triangular masking of causal attention
- # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
- extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-
- # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
- # masked positions, this operation will create a tensor which is 0.0 for
- # positions we want to attend and -10000.0 for masked positions.
- # Since we are adding it to the raw scores before the softmax, this is
- # effectively the same as removing these entirely.
- extended_attention_mask = extended_attention_mask.to(
- dtype=next(self.parameters()).dtype) # fp16 compatibility
- extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
-
- # Prepare head mask if needed
- # 1.0 in head_mask indicates we keep the head
- # attention_probs has shape bsz x n_heads x N x N
- # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
- # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
- if head_mask is not None:
- if head_mask.dim() == 1:
- head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(
- -1).unsqueeze(-1)
- head_mask = head_mask.expand(self.config.num_hidden_layers, -1,
- -1, -1, -1)
- elif head_mask.dim() == 2:
- head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(
- -1) # We can specify head_mask for each layer
- head_mask = head_mask.to(dtype=next(self.parameters(
- )).dtype) # switch to float if needed + fp16 compatibility
- else:
- head_mask = [None] * self.config.num_hidden_layers
-
- embedding_output = self.embeddings(input_ids,
- position_ids=position_ids,
- token_type_ids=token_type_ids)
- # add img_embedding_output and sum with embedding_output
- #logger.info('embedding_output: %s' % str(embedding_output.shape))
-
- encoder_outputs = self.encoder(embedding_output,
- extended_attention_mask,
- head_mask=head_mask)
- sequence_output = encoder_outputs[0]
- pooled_output = self.pooler(sequence_output)
-
- outputs = (
- sequence_output,
- pooled_output,
- ) + encoder_outputs[
- 1:] # add hidden_states and attentions if they are here
- return outputs # sequence_output, pooled_output, (hidden_states), (attentions)
-
-
-class BertImgModel(BertPreTrainedModel):
- r"""
- Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
- **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
- Sequence of hidden-states at the output of the last layer of the model.
- **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
- Last layer hidden-state of the first token of the sequence (classification token)
- further processed by a Linear layer and a Tanh activation function.
The Linear
- layer weights are trained from the next sentence prediction (classification)
- objective during Bert pretraining. This output is usually *not* a good summary
- of the semantic content of the input; you're often better off averaging or pooling
- the sequence of hidden-states for the whole input sequence.
- **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
- list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
- of shape ``(batch_size, sequence_length, hidden_size)``:
- Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- **attentions**: (`optional`, returned when ``config.output_attentions=True``)
- list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
- Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
- Examples::
-
- >>> config = BertConfig.from_pretrained('bert-base-uncased')
- >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- >>> model = BertModel(config)
- >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
- >>> outputs = model(input_ids)
- >>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
-
- """
-
- def __init__(self, config):
- super(BertImgModel, self).__init__(config)
-
- self.embeddings = BertEmbeddings(config)
- self.encoder = BertEncoder(config)
- self.pooler = BertPooler(config)
-
- self.img_dim = config.img_feature_dim #2054 #565
- logger.info('BertImgModel Image Dimension: {}'.format(self.img_dim))
- self.img_feature_type = config.img_feature_type
- self.use_img_layernorm = getattr(config, 'use_img_layernorm', None)
-
- if config.img_feature_type == 'dis_code':
- self.code_embeddings = nn.Embedding(config.code_voc,
- config.code_dim,
- padding_idx=0)
- self.img_embedding = nn.Linear(config.code_dim,
- self.config.hidden_size,
- bias=True)
- elif config.img_feature_type == 'dis_code_t': # transpose
- self.code_embeddings = nn.Embedding(config.code_voc,
- config.code_dim,
- padding_idx=0)
- self.img_embedding = nn.Linear(config.code_size,
- self.config.hidden_size,
- bias=True)
- elif config.img_feature_type == 'dis_code_scale': # scaled
- self.input_embeddings = nn.Linear(config.code_dim,
- config.code_size,
- bias=True)
- self.code_embeddings = nn.Embedding(config.code_voc,
- config.code_dim,
- padding_idx=0)
- self.img_embedding = nn.Linear(config.code_dim,
- self.config.hidden_size,
- bias=True)
- else:
- self.img_embedding = nn.Linear(self.img_dim,
- self.config.hidden_size,
- bias=True)
- self.dropout = nn.Dropout(config.hidden_dropout_prob)
- if self.use_img_layernorm:
- self.LayerNorm = LayerNormClass(config.hidden_size,
- eps=config.img_layer_norm_eps)
-
- self.apply(self.init_weights)
- self.model_type = getattr(config, 'model_type', 'bert')
- if self.model_type == 'TIMM_vit':
- self.encoder = TIMMVitEncoder(config)
-
- # re-initialize img_embedding weight
- # self.img_embedding.weight.data.normal_(mean=0.0, std=config.img_initializer_range)
-
- def _resize_token_embeddings(self, new_num_tokens):
- old_embeddings = self.embeddings.word_embeddings
- new_embeddings = self._get_resized_embeddings(old_embeddings,
- new_num_tokens)
- self.embeddings.word_embeddings = new_embeddings
- return self.embeddings.word_embeddings
-
- def _prune_heads(self,
heads_to_prune): - """ Prunes heads of the model. - heads_to_prune: dict of {layer_num: list of heads to prune in this layer} - See base class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - position_ids=None, - head_mask=None, - img_feats=None, - encoder_history_states=None): - - if attention_mask is None: - if img_feats is not None: - attention_mask = torch.ones( - (input_ids.shape[0], - input_ids.shape[1] + img_feats.shape[1]), - device=input_ids.device) - else: - attention_mask = torch.ones_like(input_ids) - #if img_feats is not None: attention_mask = torch.ones_like((input_ids.shape[0], input_ids.shape[1]+img_feats.shape[1])) - #else: attention_mask = torch.ones_like(input_ids) - - if token_type_ids is None: - token_type_ids = torch.zeros_like(input_ids) - - # We create a 3D attention mask from a 2D tensor mask. - # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # this attention mask is more simple than the triangular masking of causal attention - # used in OpenAI GPT, we just need to prepare the broadcast dimension here. - if attention_mask.dim() == 2: - extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) - elif attention_mask.dim() == 3: - extended_attention_mask = attention_mask.unsqueeze(1) - else: - raise NotImplementedError - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and -10000.0 for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. 
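To make the additive-mask arithmetic described in the comment above concrete, here is a minimal, self-contained sketch (toy shapes and values, not taken from this file) of how the 2D padding mask becomes the bias that is added to the raw attention scores:

    import torch

    # toy batch: two sequences of length 4; the second is padded at positions 2 and 3
    attention_mask = torch.tensor([[1., 1., 1., 1.],
                                   [1., 1., 0., 0.]])

    # [batch_size, seq_len] -> [batch_size, 1, 1, seq_len], broadcastable over
    # num_heads and from_seq_length
    extended = attention_mask.unsqueeze(1).unsqueeze(2)
    extended = (1.0 - extended) * -10000.0  # 0.0 where attended, -10000.0 where masked

    raw_scores = torch.randn(2, 12, 4, 4)   # [batch, heads, from_seq, to_seq]
    probs = torch.softmax(raw_scores + extended, dim=-1)
    # after softmax, padded key positions receive ~0 attention probability

Adding -10000.0 before the softmax (rather than multiplying afterwards) keeps each row normalized without any renormalization step.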
- extended_attention_mask = extended_attention_mask.to( - dtype=next(self.parameters()).dtype) # fp16 compatibility - extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 - - # Prepare head mask if needed - # 1.0 in head_mask indicates we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - if head_mask is not None: - if head_mask.dim() == 1: - head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze( - -1).unsqueeze(-1) - head_mask = head_mask.expand(self.config.num_hidden_layers, -1, - -1, -1, -1) - elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze( - -1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters( - )).dtype) # switch to float if needed + fp16 compatibility - else: - head_mask = [None] * self.config.num_hidden_layers - - embedding_output = self.embeddings(input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids) - # add img_embedding_output and sum with embedding_output - #logger.info('embedding_output: %s' % str(embedding_output.shape)) - if encoder_history_states is not None: - if encoder_history_states[0].shape[1] != 0: - assert img_feats is None or img_feats.shape[ - 1] == 0, "Cannot take image features while using encoder history states" - - if img_feats is not None: - if self.img_feature_type == 'dis_code': - code_emb = self.code_embeddings(img_feats) - img_embedding_output = self.img_embedding(code_emb) - elif self.img_feature_type == 'dis_code_t': # transpose - code_emb = self.code_embeddings(img_feats) - code_emb = code_emb.permute(0, 2, 1) - img_embedding_output = self.img_embedding(code_emb) - elif self.img_feature_type == 'dis_code_scale': # left scaled - code_emb = self.code_embeddings(img_feats) - #scale_output = - # add scale output - img_embedding_output = self.img_embedding(code_emb) - elif self.img_feature_type == 'e2e' and self.model_type == 'TIMM_vit': - img_embedding_output = img_feats - else: - if torch._C._get_tracing_state(): - # Ugly workaround to make this work for ONNX.
- # It is also valid for PyTorch but I keep this path separate to remove once fixed in ONNX - img_embedding_output = self.img_embedding( - img_feats.squeeze(0)).unsqueeze(0) - else: - img_embedding_output = self.img_embedding(img_feats) - #logger.info('img_embedding_output: %s' % str(img_embedding_output.shape)) - if self.use_img_layernorm: - img_embedding_output = self.LayerNorm(img_embedding_output) - - # add dropout on image embedding - img_embedding_output = self.dropout(img_embedding_output) - - # sum two embeddings - #padding_matrix = torch.zeros((embedding_output.shape[0], embedding_output.shape[1]-img_embedding_output.shape[1], embedding_output.shape[2])).cuda() - #img_embedding_output = torch.cat((padding_matrix, img_embedding_output), 1) - #embedding_output = embedding_output + img_embedding_output - - # concatenate two embeddings - embedding_output = torch.cat( - (embedding_output, img_embedding_output), 1) - - encoder_outputs = self.encoder( - embedding_output, - extended_attention_mask, - head_mask=head_mask, - encoder_history_states=encoder_history_states) - sequence_output = encoder_outputs[0] - pooled_output = self.pooler(sequence_output) - - outputs = ( - sequence_output, - pooled_output, - ) + encoder_outputs[ - 1:] # add hidden_states and attentions if they are here - return outputs # sequence_output, pooled_output, (hidden_states), (attentions) - - -# Place_Holder -class LangImgModel(BertPreTrainedModel): - r""" - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` - Sequence of hidden-states at the output of the last layer of the model. - **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)`` - Last layer hidden-state of the first token of the sequence (classification token) - further processed by a Linear layer and a Tanh activation function. The Linear - layer weights are trained from the next sentence prediction (classification) - objective during Bert pretraining. This output is usually *not* a good summary - of the semantic content of the input, you're often better with averaging or pooling - the sequence of hidden-states for the whole input sequence. - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
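The image path in BertImgModel.forward above reduces to: project the image features to the hidden size, optionally layer-norm, apply dropout, then concatenate after the token embeddings so one encoder attends over the joint sequence. A minimal sketch of that wiring (toy dimensions; the 2054/768 sizes are illustrative, not prescriptive):

    import torch
    from torch import nn

    hidden, img_dim = 768, 2054                 # illustrative sizes
    text_emb = torch.randn(2, 20, hidden)       # token embeddings: [batch, text_len, hidden]
    img_feats = torch.randn(2, 50, img_dim)     # region features: [batch, img_len, img_dim]

    img_embedding = nn.Linear(img_dim, hidden, bias=True)
    dropout = nn.Dropout(0.1)

    img_emb = dropout(img_embedding(img_feats))      # [batch, img_len, hidden]
    joint = torch.cat((text_emb, img_emb), dim=1)    # [batch, text_len + img_len, hidden]

    # the attention mask must cover both modalities, which is why the forward
    # above defaults to ones of length text_len + img_len when no mask is given
    attention_mask = torch.ones(2, 20 + 50)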
- - Examples:: - - >>> config = BertConfig.from_pretrained('bert-base-uncased') - >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - >>> model = BertModel(config) - >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 - >>> outputs = model(input_ids) - >>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple - - """ - - def __init__(self, config): - super(LangImgModel, self).__init__(config) - - self.embeddings = BertEmbeddings(config) - self.img_embedding = BertImgEmbeddings(config) - - self.encoder = BertEncoder(config) - self.pooler = BertPooler(config) - - #self.img_embedding = nn.Linear(565, self.config.hidden_size, bias=True) - - self.apply(self.init_weights) - - def _resize_token_embeddings(self, new_num_tokens): - old_embeddings = self.embeddings.word_embeddings - new_embeddings = self._get_resized_embeddings(old_embeddings, - new_num_tokens) - self.embeddings.word_embeddings = new_embeddings - return self.embeddings.word_embeddings - - def _prune_heads(self, heads_to_prune): - """ Prunes heads of the model. - heads_to_prune: dict of {layer_num: list of heads to prune in this layer} - See base class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - position_ids=None, - head_mask=None, - img_feats=None): - if attention_mask is None: - attention_mask = torch.ones_like(input_ids) - #if img_feats is not None: - # attention_mask = torch.ones_like((input_ids.shape[0], input_ids.shape[1]+img_feats.shape[1])) - #else: attention_mask = torch.ones_like(input_ids) - - if token_type_ids is None: - token_type_ids = torch.zeros_like(input_ids) - - # We create a 3D attention mask from a 2D tensor mask. - # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # this attention mask is more simple than the triangular masking of causal attention - # used in OpenAI GPT, we just need to prepare the broadcast dimension here. - extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and -10000.0 for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. 
- extended_attention_mask = extended_attention_mask.to( - dtype=next(self.parameters()).dtype) # fp16 compatibility - extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 - - # Prepare head mask if needed - # 1.0 in head_mask indicates we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - if head_mask is not None: - if head_mask.dim() == 1: - head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze( - -1).unsqueeze(-1) - head_mask = head_mask.expand(self.config.num_hidden_layers, -1, - -1, -1, -1) - elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze( - -1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters( - )).dtype) # switch to float if needed + fp16 compatibility - else: - head_mask = [None] * self.config.num_hidden_layers - - embedding_output = self.embeddings(input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids) - # add img_embedding_output and sum with embedding_output - #logger.info('embedding_output: %s' % str(embedding_output.shape)) - if img_feats is not None: - img_embedding_output = self.img_embedding(img_feats) - #logger.info('img_embedding_output: %s' % str(img_embedding_output.shape)) - - # sum two embeddings - #padding_matrix = torch.zeros((embedding_output.shape[0], embedding_output.shape[1]-img_embedding_output.shape[1], embedding_output.shape[2])).cuda() - #img_embedding_output = torch.cat((padding_matrix, img_embedding_output), 1) - #embedding_output = embedding_output + img_embedding_output - - # concatenate two embeddings - embedding_output = torch.cat( - (embedding_output, img_embedding_output), 1) - - encoder_outputs = self.encoder(embedding_output, - extended_attention_mask, - head_mask=head_mask) - sequence_output = encoder_outputs[0] - pooled_output = self.pooler(sequence_output) - - outputs = ( - sequence_output, - pooled_output, - ) + encoder_outputs[ - 1:] # add hidden_states and attentions if they are here - return outputs # sequence_output, pooled_output, (hidden_states), (attentions) - - -@add_start_docstrings( - """Bert Model with two heads on top as done during the pre-training: - a `masked language modeling` head and a `next sentence prediction (classification)` head. """, - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) -class BertForPreTraining(BertPreTrainedModel): - r""" - **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Labels for computing the masked language modeling loss. - Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels - in ``[0, ..., config.vocab_size]`` - **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) - Indices should be in ``[0, 1]``. - ``0`` indicates sequence B is a continuation of sequence A, - ``1`` indicates sequence B is a random sequence.
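As a sanity check on the label conventions just described, here is a toy computation of the two pre-training losses (hypothetical shapes and label values; ``ignore_index=-1`` is what lets non-masked positions drop out of the masked-LM term):

    import torch
    from torch.nn import CrossEntropyLoss

    vocab_size, seq_len = 30522, 6
    prediction_scores = torch.randn(2, seq_len, vocab_size)
    seq_relationship_score = torch.randn(2, 2)

    # only position 3 of each sequence was masked; everything else is -1 (ignored)
    masked_lm_labels = torch.full((2, seq_len), -1, dtype=torch.long)
    masked_lm_labels[:, 3] = torch.tensor([1037, 2003])
    next_sentence_label = torch.tensor([0, 1])  # 0 = continuation, 1 = random

    loss_fct = CrossEntropyLoss(ignore_index=-1)
    masked_lm_loss = loss_fct(prediction_scores.view(-1, vocab_size),
                              masked_lm_labels.view(-1))
    next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
                                  next_sentence_label.view(-1))
    total_loss = masked_lm_loss + next_sentence_loss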
- - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned when both ``masked_lm_labels`` and ``next_sentence_label`` are provided) ``torch.FloatTensor`` of shape ``(1,)``: - Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. - **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - **seq_relationship_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, 2)`` - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - - Examples:: - - >>> config = BertConfig.from_pretrained('bert-base-uncased') - >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - >>> - >>> model = BertForPreTraining(config) - >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 - >>> outputs = model(input_ids) - >>> prediction_scores, seq_relationship_scores = outputs[:2] - - """ - - def __init__(self, config): - super(BertForPreTraining, self).__init__(config) - - self.bert = BertModel(config) - self.cls = BertPreTrainingHeads(config) - - self.apply(self.init_weights) - self.tie_weights() - - def tie_weights(self): - """ Make sure we are sharing the input and output embeddings. - Export to TorchScript can't handle parameter sharing so we are cloning them instead. 
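A rough sketch of what that sharing amounts to (hand-rolled for illustration, not the actual _tie_or_clone_weights implementation): the output decoder reuses the input embedding matrix, so gradients from both ends update a single parameter, while TorchScript export would clone the tensor instead of sharing it.

    import torch
    from torch import nn

    vocab_size, hidden = 30522, 768
    word_embeddings = nn.Embedding(vocab_size, hidden)
    decoder = nn.Linear(hidden, vocab_size, bias=False)

    # tie: both modules now point at the same Parameter ([vocab_size, hidden])
    decoder.weight = word_embeddings.weight
    assert decoder.weight.data_ptr() == word_embeddings.weight.data_ptr()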
- """ - self._tie_or_clone_weights(self.cls.predictions.decoder, - self.bert.embeddings.word_embeddings) - - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - masked_lm_labels=None, - next_sentence_label=None, - position_ids=None, - head_mask=None): - outputs = self.bert(input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - attention_mask=attention_mask, - head_mask=head_mask) - - sequence_output, pooled_output = outputs[:2] - prediction_scores, seq_relationship_score = self.cls( - sequence_output, pooled_output) - - outputs = ( - prediction_scores, - seq_relationship_score, - ) + outputs[2:] # add hidden states and attention if they are here - - if masked_lm_labels is not None and next_sentence_label is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - masked_lm_labels.view(-1)) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), - next_sentence_label.view(-1)) - total_loss = masked_lm_loss + next_sentence_loss - outputs = (total_loss, ) + outputs - - return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) - - -class BertImgForPreTraining(BertPreTrainedModel): - r""" - **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Labels for computing the masked language modeling loss. - Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels - in ``[0, ..., config.vocab_size]`` - **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) - Indices should be in ``[0, 1]``. - ``0`` indicates sequence B is a continuation of sequence A, - ``1`` indicates sequence B is a random sequence. - - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned when both ``masked_lm_labels`` and ``next_sentence_label`` are provided) ``torch.FloatTensor`` of shape ``(1,)``: - Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. - **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - **seq_relationship_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, 2)`` - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
- - Examples:: - - >>> config = BertConfig.from_pretrained('bert-base-uncased') - >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - >>> - >>> model = BertForPreTraining(config) - >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 - >>> outputs = model(input_ids) - >>> prediction_scores, seq_relationship_scores = outputs[:2] - - """ - - def __init__(self, config): - super(BertImgForPreTraining, self).__init__(config) - - #self.bert = BertModel(config) # original BERT - self.bert = BertImgModel(config) - self.cls = BertPreTrainingHeads(config) - self.num_seq_relations = config.num_contrast_classes if hasattr( - config, "num_contrast_classes") else 2 - - self.apply(self.init_weights) - self.tie_weights() - - def tie_weights(self): - """ Make sure we are sharing the input and output embeddings. - Export to TorchScript can't handle parameter sharing so we are cloning them instead. - """ - self._tie_or_clone_weights(self.cls.predictions.decoder, - self.bert.embeddings.word_embeddings) - - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - masked_lm_labels=None, - next_sentence_label=None, - position_ids=None, - head_mask=None, - img_feats=None): - outputs = self.bert(input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - attention_mask=attention_mask, - head_mask=head_mask, - img_feats=img_feats) - - sequence_output, pooled_output = outputs[:2] - prediction_scores, seq_relationship_score = self.cls( - sequence_output, pooled_output) - - outputs = ( - prediction_scores, - seq_relationship_score, - ) + outputs[2:] # add hidden states and attention if they are here - - if masked_lm_labels is not None and next_sentence_label is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - masked_lm_labels.view(-1)) - next_sentence_loss = loss_fct( - seq_relationship_score.view(-1, self.num_seq_relations), - next_sentence_label.view(-1)) - total_loss = masked_lm_loss + next_sentence_loss - outputs = (total_loss, ) + outputs + (masked_lm_loss, ) - - return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) - - -@add_start_docstrings( - """Bert Model with a `language modeling` head on top. """, - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) -class BertForMaskedLM(BertPreTrainedModel): - r""" - **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Labels for computing the masked language modeling loss. - Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels - in ``[0, ..., config.vocab_size]`` - - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Masked language modeling loss. - **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
- **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - - Examples:: - - >>> config = BertConfig.from_pretrained('bert-base-uncased') - >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - >>> - >>> model = BertForMaskedLM(config) - >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 - >>> outputs = model(input_ids, masked_lm_labels=input_ids) - >>> loss, prediction_scores = outputs[:2] - - """ - - def __init__(self, config): - super(BertForMaskedLM, self).__init__(config) - - self.bert = BertModel(config) - self.cls = BertOnlyMLMHead(config) - - self.apply(self.init_weights) - self.tie_weights() - - def tie_weights(self): - """ Make sure we are sharing the input and output embeddings. - Export to TorchScript can't handle parameter sharing so we are cloning them instead. - """ - self._tie_or_clone_weights(self.cls.predictions.decoder, - self.bert.embeddings.word_embeddings) - - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - masked_lm_labels=None, - position_ids=None, - head_mask=None): - outputs = self.bert(input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - attention_mask=attention_mask, - head_mask=head_mask) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - outputs = (prediction_scores, ) + outputs[ - 2:] # Add hidden states and attention is they are here - if masked_lm_labels is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - masked_lm_labels.view(-1)) - outputs = (masked_lm_loss, ) + outputs - - return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) - - -@add_start_docstrings( - """Bert Model with a `next sentence prediction (classification)` head on top. """, - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) -class BertForNextSentencePrediction(BertPreTrainedModel): - r""" - **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) - Indices should be in ``[0, 1]``. - ``0`` indicates sequence B is a continuation of sequence A, - ``1`` indicates sequence B is a random sequence. - - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned when ``next_sentence_label`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Next sequence prediction (classification) loss. - **seq_relationship_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, 2)`` - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). 
- **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - - Examples:: - - >>> config = BertConfig.from_pretrained('bert-base-uncased') - >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - >>> - >>> model = BertForNextSentencePrediction(config) - >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 - >>> outputs = model(input_ids) - >>> seq_relationship_scores = outputs[0] - - """ - - def __init__(self, config): - super(BertForNextSentencePrediction, self).__init__(config) - - self.bert = BertModel(config) - self.cls = BertOnlyNSPHead(config) - - self.apply(self.init_weights) - - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - next_sentence_label=None, - position_ids=None, - head_mask=None): - outputs = self.bert(input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - attention_mask=attention_mask, - head_mask=head_mask) - pooled_output = outputs[1] - - seq_relationship_score = self.cls(pooled_output) - - outputs = (seq_relationship_score, ) + outputs[ - 2:] # add hidden states and attention if they are here - if next_sentence_label is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), - next_sentence_label.view(-1)) - outputs = (next_sentence_loss, ) + outputs - - return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions) - - -def instance_bce_with_logits(logits, labels, reduction='mean'): - assert logits.dim() == 2 - - loss = nn.functional.binary_cross_entropy_with_logits(logits, - labels, - reduction=reduction) - if reduction == 'mean': - loss *= labels.size(1) - return loss - - -@add_start_docstrings( - """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of - the pooled output) e.g. for GLUE tasks. """, BERT_START_DOCSTRING, - BERT_INPUTS_DOCSTRING) -class BertForSequenceClassification(BertPreTrainedModel): - r""" - **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels for computing the sequence classification/regression loss. - Indices should be in ``[0, ..., config.num_labels]``. - If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), - If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). - - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Classification (or regression if config.num_labels==1) loss. - **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` - Classification (or regression if config.num_labels==1) scores (before SoftMax). 
- **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - - Examples:: - - >>> config = BertConfig.from_pretrained('bert-base-uncased') - >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - >>> - >>> model = BertForSequenceClassification(config) - >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 - >>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 - >>> outputs = model(input_ids, labels=labels) - >>> loss, logits = outputs[:2] - - """ - - def __init__(self, config): - super(BertForSequenceClassification, self).__init__(config) - self.num_labels = config.num_labels - self.loss_type = config.loss_type - self.config = config - - #self.bert = BertModel(config) # original BERT - self.bert = BertImgModel(config) # baseline 1 - - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - self.classifier = nn.Linear(config.hidden_size, - self.config.num_labels) # original - #self.classifier = weight_norm(nn.Linear(config.hidden_size, self.config.num_labels), dim=None) - - self.apply(self.init_weights) - - def init_code_embedding(self, em): - self.bert.code_embeddings.weight.data = em.clone() - - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - labels=None, - position_ids=None, - head_mask=None, - img_feats=None): - outputs = self.bert(input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - attention_mask=attention_mask, - head_mask=head_mask, - img_feats=img_feats) - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - - outputs = (logits, ) + outputs[ - 2:] # add hidden states and attention if they are here - - if labels is not None: - if self.num_labels == 1: # doing regression - loss_fct = MSELoss() - labels = labels.to(torch.float) - loss = loss_fct(logits.view(-1), labels.view(-1)) - if self.loss_type == "ranking": - pos = logits[0:int(len(labels) / 2)] - neg = logits[int(len(labels) / 2):] - loss += (self.config.margin + neg - - pos).clamp(min=0).mean() - else: - # cross-entropy loss - #loss_fct = CrossEntropyLoss() - #loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - - # Loss from BAN codebase - #loss = instance_bce_with_logits(logits, labels) - - if self.loss_type == 'kl': - # KL Loss: https://github.com/uclanlp/visualbert/blob/master/pytorch_pretrained_bert/modeling.py - loss_fct = torch.nn.KLDivLoss(reduction="batchmean") - log_softmax = torch.nn.LogSoftmax(dim=-1) - reshaped_logits = logits.contiguous().view(-1, 3129) - reshaped_logits = log_softmax(reshaped_logits) - loss = loss_fct(reshaped_logits, labels.contiguous()) - elif self.loss_type == 'bce': # [VQA] - #logger.info('logits: {}, labels: {}'.format(logits.shape, labels.shape)) - loss = instance_bce_with_logits(logits, labels) - elif self.loss_type == 'ranking': # [Retrieval] - # [0, batch_size/2) are the positive samples, - # 
[batch_size/2, batch_size) are the negative samples. - # 1) cross_entropy loss - loss_fct = CrossEntropyLoss() - loss_sfmx = loss_fct(logits.view(-1, self.num_labels), - labels.view(-1)) - # 2) ranking loss - softmax = torch.nn.Softmax(dim=1) - probs = softmax(logits)[:, 1] - pos_probs = probs[0:int(len(labels) / 2)] - neg_probs = probs[int(len(labels) / 2):] - loss_ranking = (self.config.margin + neg_probs - - pos_probs).clamp(min=0).mean() - loss = loss_sfmx + loss_ranking - else: # cross_entropy [GQA] - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), - labels.view(-1)) - outputs = (loss, ) + outputs - - return outputs # (loss), logits, (hidden_states), (attentions) - - -class BertCaptioningHeads(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = BertLMPredictionHead(config) - - def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores - - -class BertCaptioningLoss(nn.Module): - - def __init__(self, config): - super().__init__() - self.label_smoothing = getattr(config, 'label_smoothing', 0) - self.drop_worst_ratio = getattr(config, 'drop_worst_ratio', 0) - self.drop_worst_after = getattr(config, 'drop_worst_after', 0) - self.log_soft = nn.LogSoftmax(dim=1) - self.kl = nn.KLDivLoss(reduction='none') - self.iter = 0 - - def forward(self, logits, target): - self.iter += 1 - eps = self.label_smoothing - n_class = logits.size(1) - one_hot = torch.zeros_like(logits).scatter(1, target.view(-1, 1), 1) - one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1) - log_prb = self.log_soft(logits) - loss = self.kl(log_prb, one_hot).sum(1) - - if self.drop_worst_ratio > 0 and self.iter > self.drop_worst_after: - loss, _ = torch.topk(loss, - k=int(loss.shape[0] * - (1 - self.drop_worst_ratio)), - largest=False) - - loss = loss.mean() - - return loss - - -class BertLmLoss(nn.Module): - - def __init__(self, config): - super().__init__() - self.label_smoothing = getattr(config, 'label_smoothing', 0) - self.drop_worst_ratio = getattr(config, 'drop_worst_ratio', 0) - self.drop_worst_after = getattr(config, 'drop_worst_after', 0) - self.loss = torch.nn.CrossEntropyLoss( - ignore_index=0, label_smoothing=self.label_smoothing) - self.iter = 0 - - def forward(self, logits, target): - self.iter += 1 - loss = self.loss(logits, target) - - return loss - - -# cclin -class BertImgFeatureLoss(nn.Module): - - def __init__(self, config): - super().__init__() - self.label_smoothing = getattr(config, 'label_smoothing', 0) - self.drop_worst_ratio = getattr(config, 'drop_worst_ratio', 0) - self.drop_worst_after = getattr(config, 'drop_worst_after', 0) - self.log_soft = nn.LogSoftmax(dim=1) - # self.kl = nn.KLDivLoss(reduction='none') - self.cri = nn.MSELoss() - # self.cri = nn.SmoothL1Loss() - self.iter = 0 - - def forward(self, logits, target): - self.iter += 1 - # log_prb = self.log_soft(logits) - target = target.view(-1, target.shape[-1]) - # loss = self.cri(logits, target).sum(1) - loss = self.cri(logits, target) - - # if self.drop_worst_ratio > 0 and self.iter > self.drop_worst_after: - # loss, _ = torch.topk(loss, - # k=int(loss.shape[0] * (1-self.drop_worst_ratio)), - # largest=False) - - # loss = loss.mean() - - return loss - - -@add_start_docstrings("""Bert Model transformer for image captioning""", - BERT_START_DOCSTRING) -class BertForImageCaptioning(BertPreTrainedModel): - r""" - Bert for Image Captioning. 
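To see what BertCaptioningLoss above computes, here is a toy forward pass (hypothetical sizes, smoothing eps=0.1): the one-hot target keeps 1 - eps on the gold class and spreads eps / (n_class - 1) over the rest, so the KL divergence against the log-softmax reduces to a label-smoothed cross-entropy per token.

    import torch
    from torch import nn

    eps, n_class = 0.1, 10
    logits = torch.randn(4, n_class)          # [num_masked_tokens, vocab]
    target = torch.tensor([1, 3, 0, 7])

    one_hot = torch.zeros_like(logits).scatter(1, target.view(-1, 1), 1)
    one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1)

    log_prb = nn.LogSoftmax(dim=1)(logits)
    loss = nn.KLDivLoss(reduction='none')(log_prb, one_hot).sum(1)  # per-token
    loss = loss.mean()

The drop-worst option in the class then simply keeps the (1 - drop_worst_ratio) fraction of tokens with the smallest per-token loss before averaging.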
- """ - - def __init__(self, config): - super(BertForImageCaptioning, self).__init__(config) - self.config = config - self.bert = BertImgModel(config) - self.cls = BertCaptioningHeads(config) - self.mask_loss = BertCaptioningLoss(config) - self.lm_loss = BertLmLoss(config) - # cclin - self.cls_img_feat = BertIFPredictionHead(config) - self.loss_img_feat = BertImgFeatureLoss(config) - # lambda - self.lambda_ = getattr(config, "lambda_", 0.5) - - self.apply(self.init_weights) - self.tie_weights() - - self.model_type = getattr(config, 'model_type', 'bert') - if self.model_type == 'TIMM_vit': - self.bert = BertImgModel(config) - - def tie_weights(self): - if hasattr(self.config, 'tie_weights') and self.config.tie_weights: - self._tie_or_clone_weights(self.cls.predictions.decoder, - self.bert.embeddings.word_embeddings) - freeze = False - if hasattr(self.config, 'freeze_embedding'): - freeze = self.config.freeze_embedding - self.bert.embeddings.word_embeddings.weight.requires_grad = not freeze - - def forward(self, *args, **kwargs): - is_decode = kwargs.get('is_decode', False) - inference_mode = kwargs.get('inference_mode', '') - if inference_mode: - kwargs.pop('inference_mode') - if inference_mode == 'prod': - return self.prod_generate(*args, **kwargs) - if inference_mode == 'prod_no_hidden': - return self.prod_no_hidden_generate(*args, **kwargs) - assert False, 'unknown inference_mode: {}'.format(inference_mode) - if is_decode: - return self.generate(*args, **kwargs) - else: - return self.encode_forward(*args, **kwargs) - - def encode_forward(self, - input_ids, - img_feats, - attention_mask, - masked_pos=None, - masked_ids=None, - masked_pos_img=None, - masked_token_img=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - is_training=True, - encoder_history_states=None, - input_token_ids=None, - output_token_ids=None): - outputs = self.bert(input_ids, - img_feats=img_feats, - attention_mask=attention_mask, - position_ids=position_ids, - token_type_ids=token_type_ids, - head_mask=head_mask, - encoder_history_states=encoder_history_states) - - if is_training: - sequence_output = outputs[0][:, :masked_pos.shape[-1], :] - class_logits = self.cls(sequence_output) - # b, n, c -> b, c, n - lm_loss = self.lm_loss(class_logits.float().transpose(1, 2), - output_token_ids) - masked_class_logits = class_logits[ - masked_pos == - 1, :] # it is slow, but don't have better solution now - masked_ids = masked_ids[masked_ids != -1] # remove padded target - mask_loss = self.mask_loss(masked_class_logits.float(), masked_ids) - loss = self.lambda_ * lm_loss + (1 - self.lambda_) * mask_loss - - outputs = ( - loss, - class_logits, - lm_loss, - mask_loss, - ) + outputs[2:] - else: - sequence_output = outputs[0][:, :input_ids.shape[-1], :] - class_logits = self.cls(sequence_output) - outputs = (class_logits, ) + outputs[2:] - - return outputs - - def prepare_inputs_for_generation(self, curr_ids, past=None): - # NOTE: if attention is on, it should be the token used to mask words in training - mask_token_id = self.mask_token_id - batch_size = curr_ids.shape[0] - mask_ids = torch.full((batch_size, 1), - mask_token_id, - dtype=torch.long, - device=curr_ids.device) - - def _slice(t, start, end): - if t is None: - return t - assert t.shape == (batch_size, - self.max_seq_len + self.od_labels_len) - return t[:, start:end] - - def _remove_elements(t, start, end): - if t is None: - return t - assert t.shape == (batch_size, - self.max_seq_len + self.od_labels_len) - return torch.cat([t[:, :start], t[:, end:]], 
dim=1) - - if past is None: - input_ids = torch.cat([curr_ids, mask_ids], dim=1) - - curr_len = input_ids.shape[1] - full_len = self.max_seq_len + self.od_labels_len + self.img_seq_len - assert self.full_attention_mask.shape == (batch_size, full_len, - full_len) - - def _remove_rows_cols(t, row_start, row_end, col_start, col_end): - t00 = t[:, :row_start, :col_start] - t01 = t[:, :row_start, col_end:] - t10 = t[:, row_end:, :col_start] - t11 = t[:, row_end:, col_end:] - res = torch.cat([ - torch.cat([t00, t01], dim=2), - torch.cat([t10, t11], dim=2) - ], - dim=1) - assert res.shape == (t.shape[0], - t.shape[1] - row_end + row_start, - t.shape[2] - col_end + col_start) - return res - - seq_start = curr_len - seq_end = self.max_seq_len - attention_mask = _remove_rows_cols(self.full_attention_mask, - seq_start, seq_end, seq_start, - seq_end) - - masked_pos = _remove_elements(self.full_masked_pos, seq_start, - seq_end) - token_type_ids = _remove_elements(self.full_token_type_ids, - seq_start, seq_end) - position_ids = _remove_elements(self.full_position_ids, seq_start, - seq_end) - img_feats = self.img_feats - - if self.add_od_labels: - assert self.od_label_ids.shape[1] == self.od_labels_len - input_ids = torch.cat([input_ids, self.od_label_ids], dim=1) - else: - last_token = curr_ids[:, -1:] - # The representation of last token should be re-computed, because - # it depends on both self-attention context and input tensor - input_ids = torch.cat([last_token, mask_ids], dim=1) - start_pos = curr_ids.shape[1] - 1 - end_pos = start_pos + input_ids.shape[1] - masked_pos = _slice(self.full_masked_pos, start_pos, end_pos) - token_type_ids = _slice(self.full_token_type_ids, start_pos, - end_pos) - position_ids = _slice(self.full_position_ids, start_pos, end_pos) - - img_feats = None - assert past[0].shape[0] == batch_size - if self.prev_encoded_layers is None: - assert start_pos == 1 # the first token after BOS - assert past[0].shape[ - 1] == 2 + self.od_labels_len + self.img_seq_len - # reorder to [od_labels, img_feats, sentence] - self.prev_encoded_layers = [ - torch.cat([x[:, 2:, :], x[:, :start_pos, :]], dim=1) - for x in past - ] - s2s = self.full_attention_mask[:, :self.max_seq_len, :self. - max_seq_len] - s2i = self.full_attention_mask[:, :self.max_seq_len, - self.max_seq_len:] - i2s = self.full_attention_mask[:, self.max_seq_len:, :self. - max_seq_len] - i2i = self.full_attention_mask[:, self.max_seq_len:, - self.max_seq_len:] - self.full_attention_mask = torch.cat([ - torch.cat([i2i, i2s], dim=2), - torch.cat([s2i, s2s], dim=2) - ], - dim=1) - else: - assert start_pos > 1 - assert past[0].shape[1] == 2 - self.prev_encoded_layers = [ - torch.cat([x, p[:, :-1, :]], dim=1) - for x, p in zip(self.prev_encoded_layers, past) - ] - - attention_mask = self.full_attention_mask[:, self.od_labels_len + - self.img_seq_len + - start_pos:self. - od_labels_len + - self.img_seq_len + - end_pos, :self. 
- od_labels_len + - self.img_seq_len + - end_pos] - - return { - 'input_ids': input_ids, - 'img_feats': img_feats, - 'masked_pos': masked_pos, - 'attention_mask': attention_mask, - 'token_type_ids': token_type_ids, - 'position_ids': position_ids, - 'is_training': False, - 'encoder_history_states': self.prev_encoded_layers - } - - def get_output_embeddings(self): - return self.decoder - - def generate(self, - img_feats, - attention_mask, - masked_pos, - token_type_ids=None, - position_ids=None, - head_mask=None, - input_ids=None, - max_length=None, - do_sample=None, - num_beams=None, - temperature=None, - top_k=None, - top_p=None, - repetition_penalty=None, - bos_token_id=None, - pad_token_id=None, - eos_token_ids=None, - mask_token_id=None, - length_penalty=None, - num_return_sequences=None, - num_keep_best=1, - is_decode=None, - add_od_labels=False, - od_labels_start_posid=None, - use_cbs=False, - fsm=None, - num_constraints=None, - min_constraints_to_satisfy=None, - use_hypo=False, - decoding_constraint_flag=None, - bad_ending_ids=None, - input_token_ids=None, - output_token_ids=None): - """ Generates captions given image features - """ - assert is_decode - batch_size = img_feats.shape[0] - self.img_seq_len = img_feats.shape[1] - self.max_seq_len = max_length - self.mask_token_id = mask_token_id - self.prev_encoded_layers = None - # NOTE: num_keep_best is not equivalent to num_return_sequences - # num_keep_best is the number of hypotheses to keep in beam search - # num_return_sequences is the number of times the input is repeated; coupled - # with do_sample=True it can generate more than one sample per image - self.num_keep_best = num_keep_best - - vocab_size = self.config.vocab_size - if not use_cbs: - num_fsm_states = 1 - else: - b, num_fsm_states, f1, v = fsm.shape - assert b == batch_size and v == vocab_size and f1 == num_fsm_states - - self.add_od_labels = add_od_labels - # avoid position_ids collision of caption and od labels - self.od_labels_start_posid = max(od_labels_start_posid, - self.max_seq_len) - if self.add_od_labels: - # get od labels part from input_ids - assert input_ids.shape[0] == batch_size - od_label_ids = input_ids[:, self.max_seq_len:] - self.od_labels_len = input_ids.shape[1] - self.max_seq_len - input_ids = None - else: - self.od_labels_len = 0 - od_label_ids = None - assert input_ids.shape == (batch_size, self.max_seq_len) - input_ids = None - - if input_ids is None: - input_ids = torch.full((batch_size, 1), - bos_token_id, - dtype=torch.long, - device=next(self.parameters()).device) - else: - assert input_ids.dim( - ) == 2, "Input prompt should be of shape (batch_size, sequence length)."
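Everything generate() stores on self is later duplicated so each beam, FSM state, and return sequence gets its own copy of the per-image tensors, via the _expand_for_beams helper defined below. A compact sketch of that expansion pattern under toy shapes (the helper name and sizes here are for illustration):

    import torch

    def expand_for_beams(x, num_expand):
        # [batch, ...] -> [batch * num_expand, ...] by repeating each row
        if x is None or num_expand == 1:
            return x
        shape = list(x.shape)
        x = x.unsqueeze(1).expand(shape[:1] + [num_expand] + shape[1:])
        return x.contiguous().view([shape[0] * num_expand] + shape[1:])

    img_feats = torch.randn(2, 50, 512)        # toy image features
    expanded = expand_for_beams(img_feats, 3)  # num_beams=3 -> shape [6, 50, 512]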
- assert input_ids.shape[ - 0] == batch_size, "Input batch size must match image features" - - cur_len = input_ids.shape[1] - if num_return_sequences != 1: - # Expand input to num return sequences - input_ids = self._expand_for_beams(input_ids, num_return_sequences) - effective_batch_size = batch_size * num_return_sequences - else: - effective_batch_size = batch_size - - if position_ids is None: - position_ids = torch.arange(self.max_seq_len, - dtype=torch.long, - device=input_ids.device) - posids_len = self.max_seq_len - if self.add_od_labels: - od_labels_posids = torch.arange(self.od_labels_start_posid, - self.od_labels_start_posid + - self.od_labels_len, - dtype=torch.long, - device=input_ids.device) - position_ids = torch.cat([position_ids, od_labels_posids]) - posids_len += self.od_labels_len - position_ids = position_ids.unsqueeze(0).expand( - [batch_size, posids_len]) - - num_expand = num_beams * num_fsm_states * num_return_sequences - self.od_label_ids = self._expand_for_beams(od_label_ids, num_expand) - self.img_feats = self._expand_for_beams(img_feats, num_expand) - self.full_attention_mask = self._expand_for_beams( - attention_mask, num_expand) - self.full_masked_pos = self._expand_for_beams(masked_pos, num_expand) - self.full_token_type_ids = self._expand_for_beams( - token_type_ids, num_expand) - self.full_position_ids = self._expand_for_beams( - position_ids, num_expand) - self.full_head_mask = self._expand_for_beams(head_mask, num_expand) - - if not use_cbs: - if num_beams > 1: - output = self._generate_beam_search( - input_ids, - cur_len, - max_length, - do_sample, - temperature, - top_k, - top_p, - repetition_penalty, - pad_token_id, - eos_token_ids, - effective_batch_size, - length_penalty, - num_beams, - vocab_size, - ) - else: - output = self._generate_no_beam_search( - input_ids, - cur_len, - max_length, - do_sample, - temperature, - top_k, - top_p, - repetition_penalty, - pad_token_id, - eos_token_ids, - effective_batch_size, - ) - else: - from src.modeling.utils_cbs import ( - ConstrainedBeamSearch, - select_best_beam_with_constraints, - ) - assert self.num_keep_best == 1, 'not supported n_best > 1 for CBS' - searcher = ConstrainedBeamSearch( - eos_token_ids, - max_length, - num_beams, - use_hypo=use_hypo, - decoding_constraint_flag=decoding_constraint_flag, - bad_ending_ids=bad_ending_ids) - - curr_ids, sum_logprobs = searcher.search( - input_ids, - None, - self._decode_step, - fsm, - ) - - curr_ids, logprobs = select_best_beam_with_constraints( - curr_ids, - sum_logprobs, - num_constraints, - min_constraints_to_satisfy, - eos_token_ids, - ) - # (batch_size, n_best, max_len), (batch_size, n_best) - output = (curr_ids.unsqueeze(1), logprobs.unsqueeze(1)) - - return output - - def _expand_for_beams(self, x, num_expand): - if x is None or num_expand == 1: - return x - - input_shape = list(x.shape) - expanded_shape = input_shape[:1] + [num_expand] + input_shape[1:] - x = x.unsqueeze(1).expand(expanded_shape) - # (batch_size * num_expand, ...) - x = x.contiguous().view([input_shape[0] * num_expand] + - input_shape[1:]) - return x - - def _do_output_past(self, outputs): - return len(outputs) > 1 - - def prod_generate( - self, - img_feats, - od_label_ids, - max_length, - bos_token_id, - eos_token_ids, - mask_token_id, - od_labels_start_posid, - add_od_labels=True, - cls_token_segment_id=0, - sequence_a_segment_id=0, - sequence_b_segment_id=1, - ): - """ Generates captions for PROD, batch size=1, num_beams=1. 
- Use faster generation where output_hidden_states = True - """ - batch_size = img_feats.shape[0] - assert batch_size == 1 - device = img_feats.device - assert od_label_ids.shape[0] == batch_size - od_labels_len = od_label_ids.shape[1] - img_seq_len = img_feats.shape[1] - - mask_ids = torch.full((1, 1), - mask_token_id, - dtype=torch.long, - device=device) - - # prepare inputs - cur_ids = torch.full((1, 1), - bos_token_id, - dtype=torch.long, - device=device) - - input_ids = torch.cat([cur_ids, mask_ids, od_label_ids], dim=1) - token_type_ids = torch.cat([ - torch.tensor([[cls_token_segment_id, sequence_a_segment_id]], - dtype=torch.long, - device=device), - torch.full((1, od_labels_len), - sequence_b_segment_id, - dtype=torch.long, - device=device) - ], - dim=1) - - position_ids = torch.arange(2, dtype=torch.long, device=device) - od_labels_start_posid = max(od_labels_start_posid, max_length) - if add_od_labels: - od_labels_posids = torch.arange(od_labels_start_posid, - od_labels_start_posid + - od_labels_len, - dtype=torch.long, - device=device) - position_ids = torch.cat([position_ids, od_labels_posids]) - posids_len = 2 + od_labels_len - position_ids = position_ids.unsqueeze(0).expand([1, posids_len]) - - attention_mask = torch.ones((1, 2 + od_labels_len + img_seq_len, - 2 + od_labels_len + img_seq_len), - dtype=torch.long, - device=device) - attention_mask[:, 0, - 1] = 0 # words in sentence can not see words after itself - attention_mask[:, - 2:, :2] = 0 # od_label, img_feat can not see sentence - - # make empty history states for the first step - encoder_history_states = tuple( - torch.empty([1, 0, self.config.hidden_size], device=device) - for _ in range(self.config.num_hidden_layers)) - - # prepare inputs for >1 steps - token_type_ids_after_first = torch.full([1, 2], - sequence_a_segment_id, - dtype=torch.long, - device=device) - img_feats_after_first = torch.empty( - [1, 0, self.config.img_feature_dim], - device=device) # place holder to avoid None - - # initial model inputs for the first step - model_inputs = { - 'input_ids': input_ids, - 'img_feats': img_feats, - 'attention_mask': attention_mask, - 'token_type_ids': token_type_ids, - 'position_ids': position_ids, - 'is_training': False, - 'encoder_history_states': encoder_history_states - } - cur_len = cur_ids.shape[1] - sum_logprob = 0 - while True: - outputs = self(**model_inputs) - - assert self._do_output_past(outputs) - if cur_len == 1: - assert outputs[0].shape[1] == 2 + od_labels_len - else: - assert cur_len > 1 - assert outputs[0].shape[1] == 2 - - # greedy decoding - next_token_idx = 1 - next_token_logits = outputs[0][:, next_token_idx, :] - next_token = torch.argmax(next_token_logits, dim=-1) - # Compute scores - _scores = F.log_softmax(next_token_logits, - dim=-1) # (batch_size, vocab_size) - sum_logprob += _scores[:, next_token].item() - - if next_token in eos_token_ids: - break - cur_ids = torch.cat([cur_ids, next_token.unsqueeze(-1)], dim=-1) - cur_len = cur_ids.shape[1] - if cur_len == max_length: - break - - # prepare model inputs for the next step - past = outputs[1] - last_token = cur_ids[:, -1:] - input_ids = torch.cat([last_token, mask_ids], dim=1) - position_ids = torch.arange(cur_len - 1, - cur_len + 1, - dtype=torch.long, - device=device) - attention_mask = torch.ones( - [1, 2, od_labels_len + img_seq_len + cur_len + 1], - dtype=torch.long, - device=device) - attention_mask[:, 0, -1] = 0 - assert past[0].shape[0] == batch_size - # special handle for the first step - if cur_len == 2: - assert 
past[0].shape[1] == 2 + od_labels_len + img_seq_len - # remove the first token after BOS - # reorder to [od_labels, img_feats, sentence] - encoder_history_states = [ - torch.cat([x[:, 2:, :], x[:, :1, :]], dim=1) for x in past - ] - else: - assert cur_len > 2 - assert past[0].shape[1] == 2 - encoder_history_states = [ - torch.cat([x, p[:, :-1, :]], dim=1) - for x, p in zip(encoder_history_states, past) - ] - - model_inputs = { - 'input_ids': input_ids, - 'img_feats': img_feats_after_first, - 'attention_mask': attention_mask, - 'token_type_ids': token_type_ids_after_first, - 'position_ids': position_ids, - 'is_training': False, - 'encoder_history_states': encoder_history_states - } - - logprob = sum_logprob / cur_ids.shape[1] - - # (batch_size, max_len), (batch_size) - return cur_ids, torch.full((1, ), logprob, device=device) - - def prod_no_hidden_generate( - self, - img_feats, - od_label_ids, - max_length, - bos_token_id, - eos_token_ids, - mask_token_id, - od_labels_start_posid, - add_od_labels=True, - cls_token_segment_id=0, - sequence_a_segment_id=0, - sequence_b_segment_id=1, - ): - """ Generates captions for PROD, batch size=1, num_beams=1. - Use output_hidden_states = False - """ - batch_size = img_feats.shape[0] - assert batch_size == 1 - device = img_feats.device - assert od_label_ids.shape[0] == batch_size - od_labels_len = od_label_ids.shape[1] - img_seq_len = img_feats.shape[1] - - mask_ids = torch.full((1, 1), - mask_token_id, - dtype=torch.long, - device=device) - - # prepare inputs - cur_ids = torch.full((1, 1), - bos_token_id, - dtype=torch.long, - device=device) - od_labels_start_posid = max(od_labels_start_posid, max_length) - triangle_mask = torch.tril( - torch.ones([max_length, max_length], - dtype=torch.long, - device=device)) - - def _prepare_inputs(cur_ids): - cur_len = cur_ids.shape[1] - input_ids = torch.cat([cur_ids, mask_ids, od_label_ids], dim=1) - token_type_ids = torch.cat([ - torch.tensor( - [[cls_token_segment_id]], dtype=torch.long, device=device), - torch.full((1, cur_len), - sequence_a_segment_id, - dtype=torch.long, - device=device), - torch.full((1, od_labels_len), - sequence_b_segment_id, - dtype=torch.long, - device=device) - ], - dim=1) - - token_len = cur_len + 1 - position_ids = torch.arange(token_len, - dtype=torch.long, - device=device) - if add_od_labels: - od_labels_posids = torch.arange(od_labels_start_posid, - od_labels_start_posid + - od_labels_len, - dtype=torch.long, - device=device) - position_ids = torch.cat([position_ids, od_labels_posids]) - posids_len = token_len + od_labels_len - position_ids = position_ids.unsqueeze(0).expand([1, posids_len]) - - attention_mask = torch.ones( - (1, token_len + od_labels_len + img_seq_len, - token_len + od_labels_len + img_seq_len), - dtype=torch.long, - device=device) - attention_mask[:, :token_len, :token_len].copy_( - triangle_mask[:token_len, :token_len]) - attention_mask[:, token_len:, : - token_len] = 0 # od_label, img_feat can not see sentence - return input_ids, token_type_ids, position_ids, attention_mask - - # initial model inputs for the first step - input_ids, token_type_ids, position_ids, attention_mask = \ - _prepare_inputs(cur_ids) - model_inputs = { - 'input_ids': input_ids, - 'img_feats': img_feats, - 'attention_mask': attention_mask, - 'token_type_ids': token_type_ids, - 'position_ids': position_ids, - 'is_training': False, - } - cur_len = cur_ids.shape[1] - sum_logprob = 0 - while True: - outputs = self(**model_inputs) - - assert not self._do_output_past(outputs) - - # greedy 
decoding - next_token_idx = cur_len - next_token_logits = outputs[0][:, next_token_idx, :] - next_token = torch.argmax(next_token_logits, dim=-1) - # Compute scores - _scores = F.log_softmax(next_token_logits, - dim=-1) # (batch_size, vocab_size) - sum_logprob += _scores[:, next_token].item() - - if next_token in eos_token_ids: - break - cur_ids = torch.cat([cur_ids, next_token.unsqueeze(-1)], dim=-1) - cur_len = cur_ids.shape[1] - if cur_len == max_length: - break - - # prepare model inputs for the next step - input_ids, token_type_ids, position_ids, attention_mask = \ - _prepare_inputs(cur_ids) - model_inputs = { - 'input_ids': input_ids, - 'img_feats': img_feats, - 'attention_mask': attention_mask, - 'token_type_ids': token_type_ids, - 'position_ids': position_ids, - 'is_training': False, - } - - logprob = sum_logprob / cur_ids.shape[1] - - # (batch_size, max_len), (batch_size) - return cur_ids, torch.full((1, ), logprob, device=device) - - -@add_start_docstrings( - """Bert Model with a multiple choice classification head on top (a linear layer on top of - the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, - BERT_START_DOCSTRING) -class BertForMultipleChoice(BertPreTrainedModel): - r""" - Inputs: - **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: - Indices of input sequence tokens in the vocabulary. - The second dimension of the input (`num_choices`) indicates the number of choices to score. - To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows: - - (a) For sequence pairs: - - ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` - - ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` - - (b) For single sequences: - - ``tokens: [CLS] the dog is hairy . [SEP]`` - - ``token_type_ids: 0 0 0 0 0 0 0`` - - Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`. - See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and - :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. - **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: - Segment token indices to indicate first and second portions of the inputs. - The second dimension of the input (`num_choices`) indicates the number of choices to score. - Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` - corresponds to a `sentence B` token - (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). - **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``: - Mask to avoid performing attention on padding token indices. - The second dimension of the input (`num_choices`) indicates the number of choices to score. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. - **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: - Mask to nullify selected heads of the self-attention modules. - Mask values selected in ``[0, 1]``: - ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. - **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels for computing the multiple choice classification loss. - Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension - of the input tensors. 
(see `input_ids` above) - - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Classification loss. - **classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension - of the input tensors. (see `input_ids` above). - Classification scores (before SoftMax). - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - - Examples:: - - >>> config = BertConfig.from_pretrained('bert-base-uncased') - >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - >>> - >>> model = BertForMultipleChoice(config) - >>> choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] - >>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices - >>> labels = torch.tensor(1).unsqueeze(0) # Batch size 1 - >>> outputs = model(input_ids, labels=labels) - >>> loss, classification_scores = outputs[:2] - - """ - - def __init__(self, config): - super(BertForMultipleChoice, self).__init__(config) - - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, 1) - - self.apply(self.init_weights) - - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - labels=None, - position_ids=None, - head_mask=None): - num_choices = input_ids.shape[1] - - flat_input_ids = input_ids.view(-1, input_ids.size(-1)) - flat_position_ids = position_ids.view( - -1, position_ids.size(-1)) if position_ids is not None else None - flat_token_type_ids = token_type_ids.view( - -1, - token_type_ids.size(-1)) if token_type_ids is not None else None - flat_attention_mask = attention_mask.view( - -1, - attention_mask.size(-1)) if attention_mask is not None else None - outputs = self.bert(flat_input_ids, - position_ids=flat_position_ids, - token_type_ids=flat_token_type_ids, - attention_mask=flat_attention_mask, - head_mask=head_mask) - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - reshaped_logits = logits.view(-1, num_choices) - - outputs = (reshaped_logits, ) + outputs[ - 2:] # add hidden states and attention if they are here - - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - outputs = (loss, ) + outputs - - return outputs # (loss), reshaped_logits, (hidden_states), (attentions) - - -@add_start_docstrings( - """Bert Model with a token classification head on top (a linear layer on top of - the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) -class BertForTokenClassification(BertPreTrainedModel): - r""" - **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Labels for computing the token classification loss. - Indices should be in ``[0, ..., config.num_labels]``. - - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Classification loss. - **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)`` - Classification scores (before SoftMax). - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - - Examples:: - - >>> config = BertConfig.from_pretrained('bert-base-uncased') - >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - >>> - >>> model = BertForTokenClassification(config) - >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 - >>> labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 - >>> outputs = model(input_ids, labels=labels) - >>> loss, scores = outputs[:2] - - """ - - def __init__(self, config): - super(BertForTokenClassification, self).__init__(config) - self.num_labels = config.num_labels - - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - self.apply(self.init_weights) - - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - labels=None, - position_ids=None, - head_mask=None): - outputs = self.bert(input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - attention_mask=attention_mask, - head_mask=head_mask) - sequence_output = outputs[0] - - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - - outputs = (logits, ) + outputs[ - 2:] # add hidden states and attention if they are here - if labels is not None: - loss_fct = CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels)[active_loss] - active_labels = labels.view(-1)[active_loss] - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct(logits.view(-1, self.num_labels), - labels.view(-1)) - outputs = (loss, ) + outputs - - return outputs # (loss), scores, (hidden_states), (attentions) - - -@add_start_docstrings( - """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of - the hidden-states output to compute `span start logits` and `span end logits`). 
""", - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) -class BertForQuestionAnswering(BertPreTrainedModel): - r""" - **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. - **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` - Span-start scores (before SoftMax). - **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` - Span-end scores (before SoftMax). - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
-
-    Examples::
-
-        >>> config = BertConfig.from_pretrained('bert-base-uncased')
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>>
-        >>> model = BertForQuestionAnswering(config)
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> start_positions = torch.tensor([1])
-        >>> end_positions = torch.tensor([3])
-        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        >>> loss, start_scores, end_scores = outputs[:3]
-
-    """
-
-    def __init__(self, config):
-        super(BertForQuestionAnswering, self).__init__(config)
-        self.num_labels = config.num_labels
-
-        self.bert = BertModel(config)
-        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
-
-        self.apply(self.init_weights)
-
-    def forward(self,
-                input_ids,
-                token_type_ids=None,
-                attention_mask=None,
-                start_positions=None,
-                end_positions=None,
-                position_ids=None,
-                head_mask=None):
-        outputs = self.bert(input_ids,
-                            position_ids=position_ids,
-                            token_type_ids=token_type_ids,
-                            attention_mask=attention_mask,
-                            head_mask=head_mask)
-        sequence_output = outputs[0]
-
-        logits = self.qa_outputs(sequence_output)
-        start_logits, end_logits = logits.split(1, dim=-1)
-        start_logits = start_logits.squeeze(-1)
-        end_logits = end_logits.squeeze(-1)
-
-        outputs = (start_logits, end_logits) + outputs[2:]
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, the split adds an extra dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs; we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions.clamp_(0, ignored_index)
-            end_positions.clamp_(0, ignored_index)
-
-            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-            outputs = (total_loss, ) + outputs
-
-        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
-
-
-@add_start_docstrings(
-    """Bert Model with an extra single-head attention layer on top of embeddings to ground regions to text.""",
-    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
-class BertForVLGrounding(BertPreTrainedModel):
-    r"""
-    **labels_attention_mask**: ``torch.FloatTensor`` of shape ``(batch_size, seq_length, attr_length)``:
-        Mask to perform attention between text captions + tags (of length seq_length) and region features (of length attr_length).
-        Like everywhere else in this repo, 1.0 represents KEEP and 0.0 represents MASK OUT.
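-
-    Examples (a minimal sketch, not from the original file; the sequence length,
-    region count and ``config.img_feature_dim`` below are illustrative assumptions)::
-
-        >>> config = BertConfig.from_pretrained('bert-base-uncased')
-        >>> model = BertForVLGrounding(config)
-        >>> input_ids = torch.randint(0, config.vocab_size, (1, 10))  # caption + tag tokens
-        >>> img_feats = torch.randn(1, 5, config.img_feature_dim)  # 5 region features
-        >>> labels_attention_mask = torch.ones(1, 10, 5)  # 1.0 == KEEP
-        >>> outputs = model(input_ids, img_feats=img_feats,
-        ...                 labels_attention_mask=labels_attention_mask)
-        >>> attention_scores = outputs[-1]  # (1, 10, 5), text-to-region scores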
- """ - - def __init__(self, config): - super(BertForVLGrounding, self).__init__(config) - self.attention_head_size = config.hidden_size // config.num_attention_heads - self.bert = BertImgModel(config) - self.query_layer = nn.Linear(config.hidden_size, - self.attention_head_size) - self.key_layer = nn.Linear(config.hidden_size, - self.attention_head_size) - - def transpose_for_scores(self, x): - # this function is copied from the BertSelfAttention class - new_x_shape = x.size()[:-1] + (1, self.attention_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - img_feats=None, - labels_attention_mask=None): - # standard conversion of labels_attention_mask to be added to pre-softmax scores - labels_attention_mask = labels_attention_mask.unsqueeze(1) - labels_attention_mask = (1.0 - labels_attention_mask) * -10000.0 - - # standard sequence output to take bert embeddings - outputs = self.bert(input_ids, - token_type_ids, - attention_mask=attention_mask, - img_feats=img_feats) - sequence_output = outputs[0] - - # extract queries correspond to input_ids, keys correspond to img_feats - num_img_regions = img_feats.shape[1] - queries = sequence_output[:, :-num_img_regions, :] - keys = sequence_output[:, -num_img_regions:, :] - queries = self.transpose_for_scores(self.query_layer(queries)) - keys = self.transpose_for_scores(self.key_layer(keys)) - - # take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = torch.matmul(queries, keys.transpose(-1, -2)) - attention_scores = attention_scores / math.sqrt( - self.attention_head_size) - attention_scores = attention_scores + labels_attention_mask - - # squeeze the dimension corresponding to attention heads (we only have one) - attention_scores = attention_scores.squeeze(1) - return outputs + (attention_scores, ) - - -@add_start_docstrings( - """Bert Pre-Training Model with an extra single-head attention layer on top of embeddings to ground regions to text.""", - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) -class BertImgForGroundedPreTraining(BertImgForPreTraining): - r""" - **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Labels for computing the masked language modeling loss. - Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels - in ``[0, ..., config.vocab_size]`` - **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) - Indices should be in ``[0, 1]``. - ``0`` indicates sequence B is a continuation of sequence A, - ``1`` indicates sequence B is a random sequence. - **labels_attention_mask**: ``torch.FloatTensor`` of shape ``(batch_size, seq_length, attr_length)``: - Mask to perform attention between text captions + tags (of length seq_length) and region features (of length attr_length). - Like everywhere else in this repo, 1.0 represents KEEP and 0.0 represents MASK OUT. 
- - Outputs: - **loss**: (`optional`, returned when both ``masked_lm_labels`` and ``next_sentence_label`` are provided) ``torch.FloatTensor`` of shape ``(1,)``: - Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss and the grounding. - """ - - def __init__(self, config): - super(BertImgForGroundedPreTraining, self).__init__(config) - self.attention_head_size = config.hidden_size // config.num_attention_heads - self.query_layer = nn.Linear(config.hidden_size, - self.attention_head_size) - self.key_layer = nn.Linear(config.hidden_size, - self.attention_head_size) - self.apply(self.init_weights) - - def transpose_for_scores(self, x): - # this function is copied from the BertSelfAttention class - new_x_shape = x.size()[:-1] + (1, self.attention_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - masked_lm_labels=None, - next_sentence_label=None, - position_ids=None, - head_mask=None, - img_feats=None, - grounding_labels=None, - grounding_mask=None, - grounding_weight=1.0): - - # standard sequence output to take bert embeddings - outputs = self.bert(input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - attention_mask=attention_mask, - head_mask=head_mask, - img_feats=img_feats) - sequence_output, pooled_output = outputs[:2] - - # compute masked token predictions and contrastive prediction - prediction_scores, seq_relationship_score = self.cls( - sequence_output, pooled_output) - outputs = ( - prediction_scores, - seq_relationship_score, - ) + outputs[2:] # add hidden states and attention if they are here - - # compute loss - if masked_lm_labels is not None and next_sentence_label is not None and grounding_labels is not None and grounding_mask is not None: - - # losses below are both scalars - loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - masked_lm_labels.view(-1)) - next_sentence_loss = loss_fct( - seq_relationship_score.view(-1, self.num_seq_relations), - next_sentence_label.view(-1)) - - # standard conversion of labels_attention_mask to be added to pre-softmax scores - grounding_mask = grounding_mask.unsqueeze(1) - grounding_mask = (1.0 - grounding_mask) * -10000.0 - - # extract queries correspond to input_ids, keys correspond to img_feats - num_img_regions = img_feats.shape[1] - queries = sequence_output[:, :-num_img_regions, :] - keys = sequence_output[:, -num_img_regions:, :] - queries = self.transpose_for_scores(self.query_layer(queries)) - keys = self.transpose_for_scores(self.key_layer(keys)) - - # take the dot product between "query" and "key" to get the raw attention scores. 
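-            # Shape sketch (an illustration only; B/T/R/d are not names used here):
-            #     queries: (B, 1, T, d), keys: (B, 1, R, d)
-            #     matmul -> (B, 1, T, R), scaled by 1/sqrt(d); adding grounding_mask
-            #     puts -10000.0 on masked region slots so they receive ~0 probability
-            #     under the Categorical over regions used in the KL loss below.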
- attention_logits = torch.matmul(queries, keys.transpose(-1, -2)) - attention_logits = attention_logits / math.sqrt( - self.attention_head_size) - attention_logits = attention_logits + grounding_mask - - # squeeze the dimension corresponding to attention heads (we only have one) - attention_logits = attention_logits.squeeze(1) - - # used to only consider losses for tokens corresponding to phrases; - # this mask has shape (batch_size, number of tokens) - loss_mask = (grounding_labels.sum(dim=-1) > 0).float() - - if loss_mask.sum() == 0: - grounding_loss = torch.zeros_like(masked_lm_loss) - else: - grounding_loss = kl_divergence( - Categorical( - probs=grounding_labels[loss_mask == 1].float()), - Categorical( - logits=attention_logits[loss_mask == 1])).mean() - - total_loss = masked_lm_loss + next_sentence_loss + grounding_weight * grounding_loss - outputs = (total_loss, ) + outputs + (masked_lm_loss, - grounding_loss) - - return outputs diff --git a/AVLFormer/src/layers/bert/modeling_utils.py b/AVLFormer/src/layers/bert/modeling_utils.py deleted file mode 100644 index fc722f8..0000000 --- a/AVLFormer/src/layers/bert/modeling_utils.py +++ /dev/null @@ -1,1754 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch BERT model.""" - -from __future__ import absolute_import, division, print_function, unicode_literals - -import copy -from io import open -import json -import logging -import os - -import six -from src.utils.comm import is_main_process -import torch -from torch import nn -from torch.nn import CrossEntropyLoss -from torch.nn import functional as F - -from .file_utils import cached_path - -logger = logging.getLogger(__name__) # pylint: disable=invalid-name -if not is_main_process(): - logger.disabled = True - -CONFIG_NAME = "config.json" -WEIGHTS_NAME = "pytorch_model.bin" -TF_WEIGHTS_NAME = 'model.ckpt' - -try: - from torch.nn import Identity -except ImportError: - # Older PyTorch compatibility - class Identity(nn.Module): - r"""A placeholder identity operator that is argument-insensitive. - """ - - def __init__(self, *args, **kwargs): - super(Identity, self).__init__() - - def forward(self, input): - return input - - -if not six.PY2: - - def add_start_docstrings(*docstr): - - def docstring_decorator(fn): - fn.__doc__ = ''.join(docstr) + fn.__doc__ - return fn - - return docstring_decorator -else: - # Not possible to update class docstrings on python2 - def add_start_docstrings(*docstr): - - def docstring_decorator(fn): - return fn - - return docstring_decorator - - -class PretrainedConfig(object): - """ Base class for all configuration classes. - Handle a few common parameters and methods for loading/downloading/saving configurations. 
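-
-        A subclassing sketch (illustrative only; ``MyConfig`` is not a class
-        defined in this file)::
-
-            class MyConfig(PretrainedConfig):
-                def __init__(self, hidden_size=768, **kwargs):
-                    super(MyConfig, self).__init__(**kwargs)
-                    self.hidden_size = hidden_size
-
-            config = MyConfig(num_labels=3)  # `num_labels` is handled by the base class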
- """ - pretrained_config_archive_map = {} - - def __init__(self, **kwargs): - self.finetuning_task = kwargs.pop('finetuning_task', None) - self.num_labels = kwargs.pop('num_labels', 2) - self.output_attentions = kwargs.pop('output_attentions', False) - self.output_hidden_states = kwargs.pop('output_hidden_states', False) - self.torchscript = kwargs.pop('torchscript', False) - - def save_pretrained(self, save_directory): - """ Save a configuration object to a directory, so that it - can be re-loaded using the `from_pretrained(save_directory)` class method. - """ - assert os.path.isdir( - save_directory - ), "Saving path should be a directory where the model and configuration can be saved" - - # If we save using the predefined names, we can load using `from_pretrained` - output_config_file = os.path.join(save_directory, CONFIG_NAME) - - self.to_json_file(output_config_file) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - r""" Instantiate a PretrainedConfig from a pre-trained model configuration. - - Params: - **pretrained_model_name_or_path**: either: - - a string with the `shortcut name` of a pre-trained model configuration to load from cache - or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). - - a path to a `directory` containing a configuration file saved - using the `save_pretrained(save_directory)` method. - - a path or url to a saved configuration `file`. - **cache_dir**: (`optional`) string: - Path to a directory in which a downloaded pre-trained model - configuration should be cached if the standard cache should not be used. - **return_unused_kwargs**: (`optional`) bool: - - If False, then this function returns just the final configuration object. - - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` - is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: - ie the part of kwargs which has not been used to update `config` and is otherwise ignored. - **kwargs**: (`optional`) dict: - Dictionary of key/value pairs with which to update the configuration object after loading. - - The values in kwargs of any keys which are configuration attributes will be used - to override the loaded values. - - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled - by the `return_unused_kwargs` keyword parameter. - - Examples:: - - >>> config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. - >>> config = BertConfig.from_pretrained('./test/saved_model/') # E.g. 
config (or model) was saved using `save_pretrained('./test/saved_model/')` - >>> config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') - >>> config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) - >>> assert config.output_attention == True - >>> config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, - >>> foo=False, return_unused_kwargs=True) - >>> assert config.output_attention == True - >>> assert unused_kwargs == {'foo': False} - - """ - cache_dir = kwargs.pop('cache_dir', None) - return_unused_kwargs = kwargs.pop('return_unused_kwargs', False) - - if pretrained_model_name_or_path in cls.pretrained_config_archive_map: - config_file = cls.pretrained_config_archive_map[ - pretrained_model_name_or_path] - elif os.path.isdir(pretrained_model_name_or_path): - config_file = os.path.join(pretrained_model_name_or_path, - CONFIG_NAME) - else: - config_file = pretrained_model_name_or_path - # redirect to the cache, if necessary - try: - resolved_config_file = cached_path(config_file, - cache_dir=cache_dir) - except EnvironmentError: - if pretrained_model_name_or_path in cls.pretrained_config_archive_map: - logger.error( - "Couldn't reach server at '{}' to download pretrained model configuration file." - .format(config_file)) - else: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find any file " - "associated to this path or url.".format( - pretrained_model_name_or_path, - ', '.join(cls.pretrained_config_archive_map.keys()), - config_file)) - return None - if resolved_config_file == config_file: - logger.info("loading configuration file {}".format(config_file)) - else: - logger.info( - "loading configuration file {} from cache at {}".format( - config_file, resolved_config_file)) - - # Load config - config = cls.from_json_file(resolved_config_file) - - # Update config with kwargs if needed - to_remove = [] - for key, value in kwargs.items(): - if hasattr(config, key): - setattr(config, key, value) - to_remove.append(key) - # add img_layer_norm_eps, use_img_layernorm - if "img_layer_norm_eps" in kwargs: - setattr(config, "img_layer_norm_eps", kwargs["img_layer_norm_eps"]) - to_remove.append("img_layer_norm_eps") - if "use_img_layernorm" in kwargs: - setattr(config, "use_img_layernorm", kwargs["use_img_layernorm"]) - to_remove.append("use_img_layernorm") - for key in to_remove: - kwargs.pop(key, None) - - logger.info("Model config %s", config) - if return_unused_kwargs: - return config, kwargs - else: - return config - - @classmethod - def from_dict(cls, json_object): - """Constructs a `Config` from a Python dictionary of parameters.""" - config = cls(vocab_size_or_config_json_file=-1) - for key, value in json_object.items(): - config.__dict__[key] = value - return config - - @classmethod - def from_json_file(cls, json_file): - """Constructs a `BertConfig` from a json file of parameters.""" - with open(json_file, "r", encoding='utf-8') as reader: - text = reader.read() - return cls.from_dict(json.loads(text)) - - def __eq__(self, other): - return self.__dict__ == other.__dict__ - - def __repr__(self): - return str(self.to_json_string()) - - def to_dict(self): - """Serializes this instance to a Python dictionary.""" - output = copy.deepcopy(self.__dict__) - return output - - def to_json_string(self): - """Serializes this instance to a JSON string.""" - return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" - - 
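-
-    # A minimal round-trip sketch (illustrative only, not part of the original file):
-    #
-    #     cfg = PretrainedConfig(num_labels=3)
-    #     cfg.to_json_file('/tmp/config.json')  # writes the to_json_string() output
-    #     clone = PretrainedConfig.from_json_file('/tmp/config.json')
-    #     assert clone == cfg  # __eq__ compares the instances' __dict__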
def to_json_file(self, json_file_path): - """ Save this instance to a json file.""" - with open(json_file_path, "w", encoding='utf-8') as writer: - writer.write(self.to_json_string()) - - -class PreTrainedModel(nn.Module): - """ Base class for all models. Handle loading/storing model config and - a simple interface for dowloading and loading pretrained models. - """ - config_class = PretrainedConfig - pretrained_model_archive_map = {} - load_tf_weights = lambda model, config, path: None - base_model_prefix = "" - input_embeddings = None - - def __init__(self, config, *inputs, **kwargs): - super(PreTrainedModel, self).__init__() - if not isinstance(config, PretrainedConfig): - raise ValueError( - "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. " - "To create a model from a pretrained model use " - "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( - self.__class__.__name__, self.__class__.__name__)) - # Save config in model - self.config = config - - def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None): - """ Build a resized Embedding Module from a provided token Embedding Module. - Increasing the size will add newly initialized vectors at the end - Reducing the size will remove vectors from the end - - Args: - new_num_tokens: (`optional`) int - New number of tokens in the embedding matrix. - Increasing the size will add newly initialized vectors at the end - Reducing the size will remove vectors from the end - If not provided or None: return the provided token Embedding Module. - Return: ``torch.nn.Embeddings`` - Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None - """ - if new_num_tokens is None: - return old_embeddings - - old_num_tokens, old_embedding_dim = old_embeddings.weight.size() - if old_num_tokens == new_num_tokens: - return old_embeddings - - # Build new embeddings - new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim) - new_embeddings.to(old_embeddings.weight.device) - - # initialize all new embeddings (in particular added tokens) - self.init_weights(new_embeddings) - - # Copy word embeddings from the previous weights - num_tokens_to_copy = min(old_num_tokens, new_num_tokens) - new_embeddings.weight.data[: - num_tokens_to_copy, :] = old_embeddings.weight.data[: - num_tokens_to_copy, :] - - return new_embeddings - - def _tie_or_clone_weights(self, first_module, second_module): - """ Tie or clone module weights depending of weither we are using TorchScript or not - """ - if self.config.torchscript: - first_module.weight = nn.Parameter(second_module.weight.clone()) - else: - first_module.weight = second_module.weight - - def resize_token_embeddings(self, new_num_tokens=None): - """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. - Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. - - Args: - new_num_tokens: (`optional`) int - New number of tokens in the embedding matrix. - Increasing the size will add newly initialized vectors at the end - Reducing the size will remove vectors from the end - If not provided or None: does nothing and just returns a pointer to the input tokens Embedding Module of the model. 
-
-        Return: ``torch.nn.Embeddings``
-            Pointer to the input tokens Embedding Module of the model
-        """
-        base_model = getattr(self, self.base_model_prefix,
-                             self)  # get the base model if needed
-        model_embeds = base_model._resize_token_embeddings(new_num_tokens)
-        if new_num_tokens is None:
-            return model_embeds
-
-        # Update base model and current model config
-        self.config.vocab_size = new_num_tokens
-        base_model.vocab_size = new_num_tokens
-
-        # Tie weights again if needed
-        if hasattr(self, 'tie_weights'):
-            self.tie_weights()
-
-        return model_embeds
-
-    def prune_heads(self, heads_to_prune):
-        """ Prunes heads of the base model.
-            Args:
-                heads_to_prune: dict of {layer_num (int): list of heads to prune in this layer (list of int)}
-        """
-        base_model = getattr(self, self.base_model_prefix,
-                             self)  # get the base model if needed
-        base_model._prune_heads(heads_to_prune)
-
-    def save_pretrained(self, save_directory):
-        """ Save a model with its configuration file to a directory, so that it
-            can be re-loaded using the `from_pretrained(save_directory)` class method.
-        """
-        assert os.path.isdir(
-            save_directory
-        ), "Saving path should be a directory where the model and configuration can be saved"
-
-        # Only save the model itself if we are using distributed training
-        model_to_save = self.module if hasattr(self, 'module') else self
-
-        # Save configuration file
-        model_to_save.config.save_pretrained(save_directory)
-
-        # If we save using the predefined names, we can load using `from_pretrained`
-        output_model_file = os.path.join(save_directory, WEIGHTS_NAME)
-
-        torch.save(model_to_save.state_dict(), output_model_file)
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args,
-                        **kwargs):
-        r"""Instantiate a pretrained PyTorch model from a pre-trained model configuration.
-
-        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated).
-        To train the model, you should first set it back in training mode with `model.train()`.
-
-        Params:
-            **pretrained_model_name_or_path**: either:
-                - a string with the `shortcut name` of a pre-trained model to load from cache
-                    or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
-                - a path to a `directory` containing a configuration file saved
-                    using the `save_pretrained(save_directory)` method.
-                - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`).
-                    In this case, ``from_tf`` should be set to True and a configuration object should be
-                    provided as `config` argument. This loading option is slower than converting the TensorFlow
-                    checkpoint to a PyTorch model using the provided conversion scripts and loading
-                    the PyTorch model afterwards.
-            **model_args**: (`optional`) Sequence:
-                All remaining positional arguments will be passed to the underlying model's __init__ function
-            **config**: an optional configuration for the model to use instead of an automatically loaded configuration.
-                Configuration can be automatically loaded when:
-                - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or
-                - the model was saved using the `save_pretrained(save_directory)` method (loaded by supplying the save directory).
-            **state_dict**: an optional state dictionary for the model to use instead of a state dictionary loaded
-                from saved weights file.
-                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
- In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not - a simpler option. - **cache_dir**: (`optional`) string: - Path to a directory in which a downloaded pre-trained model - configuration should be cached if the standard cache should not be used. - **output_loading_info**: (`optional`) boolean: - Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. - **kwargs**: (`optional`) dict: - Dictionary of key, values to update the configuration object after loading. - Can be used to override selected configuration parameters. E.g. ``output_attention=True``. - - - If a configuration is provided with `config`, **kwargs will be directly passed - to the underlying model's __init__ method. - - If a configuration is not provided, **kwargs will be first passed to the pretrained - model configuration class loading function (`PretrainedConfig.from_pretrained`). - Each key of **kwargs that corresponds to a configuration attribute - will be used to override said attribute with the supplied **kwargs value. - Remaining keys that do not correspond to any configuration attribute will - be passed to the underlying model's __init__ function. - - Examples:: - - >>> model = BertModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. - >>> model = BertModel.from_pretrained('./test/saved_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` - >>> model = BertModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading - >>> assert model.config.output_attention == True - >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower) - >>> config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json') - >>> model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config) - - """ - config = kwargs.pop('config', None) - state_dict = kwargs.pop('state_dict', None) - cache_dir = kwargs.pop('cache_dir', None) - from_tf = kwargs.pop('from_tf', False) - output_loading_info = kwargs.pop('output_loading_info', False) - - # Load config - if config is None: - config, model_kwargs = cls.config_class.from_pretrained( - pretrained_model_name_or_path, - *model_args, - cache_dir=cache_dir, - return_unused_kwargs=True, - **kwargs) - else: - model_kwargs = kwargs - - # Load model - if pretrained_model_name_or_path in cls.pretrained_model_archive_map: - archive_file = cls.pretrained_model_archive_map[ - pretrained_model_name_or_path] - elif os.path.isdir(pretrained_model_name_or_path): - if from_tf: - # Directly load from a TensorFlow checkpoint - archive_file = os.path.join(pretrained_model_name_or_path, - TF_WEIGHTS_NAME + ".index") - else: - archive_file = os.path.join(pretrained_model_name_or_path, - WEIGHTS_NAME) - else: - if from_tf: - # Directly load from a TensorFlow checkpoint - archive_file = pretrained_model_name_or_path + ".index" - else: - archive_file = pretrained_model_name_or_path - # redirect to the cache, if necessary - try: - resolved_archive_file = cached_path(archive_file, - cache_dir=cache_dir) - except EnvironmentError: - if pretrained_model_name_or_path in cls.pretrained_model_archive_map: - logger.error( - "Couldn't reach server at '{}' to download pretrained weights." - .format(archive_file)) - else: - logger.error( - "Model name '{}' was not found in model name list ({}). 
" - "We assumed '{}' was a path or url but couldn't find any file " - "associated to this path or url.".format( - pretrained_model_name_or_path, - ', '.join(cls.pretrained_model_archive_map.keys()), - archive_file)) - return None - if resolved_archive_file == archive_file: - logger.info("loading weights file {}".format(archive_file)) - else: - logger.info("loading weights file {} from cache at {}".format( - archive_file, resolved_archive_file)) - - # Instantiate model. - model = cls(config, *model_args, **model_kwargs) - - if state_dict is None and not from_tf: - state_dict = torch.load(resolved_archive_file, map_location='cpu') - - if from_tf: - # Directly load from a TensorFlow checkpoint - return cls.load_tf_weights( - model, config, - resolved_archive_file[:-6]) # Remove the '.index' - - # Convert old format to new format if needed from a PyTorch state_dict - old_keys = [] - new_keys = [] - for key in state_dict.keys(): - new_key = None - if 'gamma' in key: - new_key = key.replace('gamma', 'weight') - if 'beta' in key: - new_key = key.replace('beta', 'bias') - if new_key: - old_keys.append(key) - new_keys.append(new_key) - for old_key, new_key in zip(old_keys, new_keys): - state_dict[new_key] = state_dict.pop(old_key) - - # Load from a PyTorch state_dict - missing_keys = [] - unexpected_keys = [] - error_msgs = [] - # copy state_dict so _load_from_state_dict can modify it - metadata = getattr(state_dict, '_metadata', None) - state_dict = state_dict.copy() - if metadata is not None: - state_dict._metadata = metadata - - def load(module, prefix=''): - local_metadata = {} if metadata is None else metadata.get( - prefix[:-1], {}) - module._load_from_state_dict(state_dict, prefix, local_metadata, - True, missing_keys, unexpected_keys, - error_msgs) - for name, child in module._modules.items(): - if child is not None: - load(child, prefix + name + '.') - - # Make sure we are able to load base models as well as derived models (with heads) - start_prefix = '' - model_to_load = model - if not hasattr(model, cls.base_model_prefix) and any( - s.startswith(cls.base_model_prefix) - for s in state_dict.keys()): - start_prefix = cls.base_model_prefix + '.' - if hasattr(model, cls.base_model_prefix) and not any( - s.startswith(cls.base_model_prefix) - for s in state_dict.keys()): - model_to_load = getattr(model, cls.base_model_prefix) - - load(model_to_load, prefix=start_prefix) - if len(missing_keys) > 0: - logger.info( - "Weights of {} not initialized from pretrained model: {}". 
- format(model.__class__.__name__, missing_keys)) - if len(unexpected_keys) > 0: - logger.info( - "Weights from pretrained model not used in {}: {}".format( - model.__class__.__name__, unexpected_keys)) - if len( - error_msgs - ) == 2 and "size mismatch for cls.seq_relationship.weight" in error_msgs[ - 0]: - logger.info('Error(s) in loading state_dict for {}:\n\t{}'.format( - model.__class__.__name__, "\n\t".join(error_msgs))) - elif len(error_msgs) > 0: - raise RuntimeError( - 'Error(s) in loading state_dict for {}:\n\t{}'.format( - model.__class__.__name__, "\n\t".join(error_msgs))) - - if hasattr(model, 'tie_weights'): - model.tie_weights( - ) # make sure word embedding weights are still tied - - # Set model in evaluation mode to desactivate DropOut modules by default - model.eval() - - if output_loading_info: - loading_info = { - "missing_keys": missing_keys, - "unexpected_keys": unexpected_keys, - "error_msgs": error_msgs - } - return model, loading_info - - return model - - def prepare_inputs_for_generation(self, input_ids, **kwargs): - return {"input_ids": input_ids} - - def _do_output_past(self, outputs): - has_output_past = hasattr(self.config, - "output_past") and self.config.output_past - has_mem_len = hasattr(self.config, "mem_len") and self.config.mem_len - - if has_output_past and not has_mem_len and len(outputs) > 1: - return True - elif has_mem_len and self.config.mem_len > 0 and len(outputs) > 1: - return True - - return False - - def generate( - self, - input_ids=None, - max_length=None, - do_sample=None, - num_beams=None, - temperature=None, - top_k=None, - top_p=None, - repetition_penalty=None, - bos_token_id=None, - pad_token_id=None, - eos_token_ids=None, - length_penalty=None, - num_return_sequences=None, - ): - r""" Generates sequences for models with a LM head. The method currently supports greedy or penalized greedy decoding, sampling with top-k or nucleus sampling - and beam-search. - - Adapted in part from `Facebook's XLM beam search code`_. - - .. _`Facebook's XLM beam search code`: - https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529 - - - Parameters: - - input_ids: (`optional`) `torch.LongTensor` of shape `(batch_size, sequence_length)` - The sequence used as a prompt for the generation. If `None` the method initializes - it as an empty `torch.LongTensor` of shape `(1,)`. - - max_length: (`optional`) int - The max length of the sequence to be generated. Between 1 and infinity. Default to 20. - - do_sample: (`optional`) bool - If set to `False` greedy decoding is used. Otherwise sampling is used. Default to greedy sampling. - - num_beams: (`optional`) int - Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1. - - temperature: (`optional`) float - The value used to module the next token probabilities. Must be strictely positive. Default to 1.0. - - top_k: (`optional`) int - The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50. - - top_p: (`optional`) float - The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1. - - repetition_penalty: (`optional`) float - The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0. - - bos_token_id: (`optional`) int - Beginning of sentence token if no prompt is provided. Default to 0. 
-
-            eos_token_ids: (`optional`) int or list of int
-                End of sequence token or list of tokens to stop the generation. Default to 0.
-
-            length_penalty: (`optional`) float
-                Exponential penalty to the length. Default to 1.
-
-            num_return_sequences: (`optional`) int
-                The number of independently computed returned sequences for each element in the batch. Default to 1.
-
-        Examples::
-
-            tokenizer = AutoTokenizer.from_pretrained('distilgpt2')   # Initialize tokenizer
-            model = AutoModelWithLMHead.from_pretrained('distilgpt2')    # Download model and configuration from S3 and cache.
-            outputs = model.generate(max_length=40, bos_token_id=tokenizer.bos_token_id, eos_token_ids=tokenizer.eos_token_id)  # do greedy decoding without beam search
-            print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
-
-            tokenizer = AutoTokenizer.from_pretrained('openai-gpt')   # Initialize tokenizer
-            model = AutoModelWithLMHead.from_pretrained('openai-gpt')    # Download model and configuration from S3 and cache.
-            input_context = 'The dog'
-            input_ids = torch.tensor(tokenizer.encode(input_context)).unsqueeze(0)  # encode input context
-            outputs = model.generate(input_ids=input_ids, do_sample=True, num_beams=5, num_return_sequences=3, temperature=1.5)  # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog'
-            for i in range(3):  # 3 output sequences were generated
-                print('Generated {}: {}'.format(i, tokenizer.decode(outputs[0][i], skip_special_tokens=True)))
-
-            tokenizer = AutoTokenizer.from_pretrained('distilgpt2')   # Initialize tokenizer
-            model = AutoModelWithLMHead.from_pretrained('distilgpt2')    # Download model and configuration from S3 and cache.
-            input_context = 'The dog'
-            input_ids = torch.tensor(tokenizer.encode(input_context)).unsqueeze(0)  # encode input context
-            outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, bos_token_id=tokenizer.bos_token_id, eos_token_ids=tokenizer.eos_token_id, num_beams=3)  # generate sequences using greedy beam search decoding (3 beams)
-            print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
-
-            tokenizer = AutoTokenizer.from_pretrained('ctrl')   # Initialize tokenizer
-            model = AutoModelWithLMHead.from_pretrained('ctrl')    # Download model and configuration from S3 and cache.
-            input_context = 'Legal My neighbor is'  # "Legal" is one of the control codes for ctrl
-            input_ids = torch.tensor(tokenizer.encode(input_context)).unsqueeze(0)  # encode input context
-            outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2)  # generate sequences using greedy search
-            print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
-
-        """
-
-        # We cannot generate if the model does not have a LM head
-        if self.get_output_embeddings() is None:
-            raise AttributeError(
-                "You tried to generate sequences with a model that does not have a LM Head. "
-                "Please use another model class (e.g. `OpenAIGPTLMHeadModel`, `XLNetLMHeadModel`, `GPT2LMHeadModel`, `CTRLLMHeadModel`, `T5WithLMHeadModel`, `TransfoXLLMHeadModel`)"
-            )
-
-        max_length = max_length if max_length is not None else self.config.max_length
-        do_sample = do_sample if do_sample is not None else self.config.do_sample
-        num_beams = num_beams if num_beams is not None else self.config.num_beams
-        temperature = temperature if temperature is not None else self.config.temperature
-        top_k = top_k if top_k is not None else self.config.top_k
-        top_p = top_p if top_p is not None else self.config.top_p
-        repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty
-        bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
-        pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
-        eos_token_ids = eos_token_ids if eos_token_ids is not None else self.config.eos_token_ids
-        length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty
-        num_return_sequences = (num_return_sequences
-                                if num_return_sequences is not None else
-                                self.config.num_return_sequences)
-
-        if input_ids is not None:
-            batch_size = input_ids.shape[0]  # overridden by the input batch_size
-        else:
-            batch_size = 1
-        if isinstance(eos_token_ids, int):
-            eos_token_ids = [eos_token_ids]
-
-        assert isinstance(max_length, int) and max_length > 0, \
-            "`max_length` should be a strictly positive integer."
-        assert isinstance(do_sample, bool), "`do_sample` should be a boolean."
-        assert isinstance(num_beams, int) and num_beams > 0, \
-            "`num_beams` should be a strictly positive integer."
-        assert temperature > 0, "`temperature` should be strictly positive."
-        assert isinstance(top_k, int) and top_k >= 0, \
-            "`top_k` should be a positive integer."
-        assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1."
-        assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1."
-        assert isinstance(bos_token_id, int) and bos_token_id >= 0, \
-            "`bos_token_id` should be a positive integer."
-        assert isinstance(pad_token_id, int) and pad_token_id >= 0, \
-            "`pad_token_id` should be a positive integer."
-        assert isinstance(eos_token_ids, (list, tuple)) and all(
-            e >= 0 for e in eos_token_ids
-        ), "`eos_token_ids` should be a positive integer or a list/tuple of positive integers."
-        assert length_penalty > 0, "`length_penalty` should be strictly positive."
-        assert isinstance(num_return_sequences, int) and num_return_sequences > 0, \
-            "`num_return_sequences` should be a strictly positive integer."
-
-        if input_ids is None:
-            input_ids = torch.full((batch_size, 1),
-                                   bos_token_id,
-                                   dtype=torch.long,
-                                   device=next(self.parameters()).device)
-        else:
-            assert input_ids.dim() == 2, \
-                "Input prompt should be of shape (batch_size, sequence length)."
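-
-        # A small worked example (an illustration only, not from the original
-        # source) of the expansion performed below when num_return_sequences > 1:
-        # a (2, 3) prompt batch with num_return_sequences = 2 becomes (4, 3),
-        # repeating each row in place:
-        #
-        #     x = torch.arange(6).view(2, 3)
-        #     y = x.unsqueeze(1).expand(2, 2, 3).contiguous().view(4, 3)
-        #     # rows of y: x[0], x[0], x[1], x[1]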
- - # current position and vocab size - cur_len = input_ids.shape[1] - vocab_size = self.config.vocab_size - - if num_return_sequences != 1: - # Expand input to num return sequences - input_ids = input_ids.unsqueeze(1).expand(batch_size, - num_return_sequences, - cur_len) - input_ids = input_ids.contiguous().view( - batch_size * num_return_sequences, - cur_len) # (batch_size * num_return_sequences, cur_len) - effective_batch_size = batch_size * num_return_sequences - else: - effective_batch_size = batch_size - - if num_beams > 1: - output = self._generate_beam_search( - input_ids, - cur_len, - max_length, - do_sample, - temperature, - top_k, - top_p, - repetition_penalty, - pad_token_id, - eos_token_ids, - effective_batch_size, - length_penalty, - num_beams, - vocab_size, - ) - else: - output = self._generate_no_beam_search( - input_ids, - cur_len, - max_length, - do_sample, - temperature, - top_k, - top_p, - repetition_penalty, - pad_token_id, - eos_token_ids, - effective_batch_size, - ) - - if num_return_sequences != 1: - for i in range(len(output)): - output[i] = output[i].view(batch_size, num_return_sequences, - -1) - return output - - def _decode_step(self, input_ids, past): - model_inputs = self.prepare_inputs_for_generation(input_ids, past=past) - outputs = self( - **model_inputs) # (batch_size * num_beams, cur_len, vocab_size) - token_len = outputs[0].shape[1] - if self.od_labels_len == 0: - next_token_idx = token_len - 1 - else: - if token_len == 2: - assert self._do_output_past(outputs) - next_token_idx = 1 - else: - next_token_idx = token_len - self.od_labels_len - 1 - - next_token_logits = outputs[ - 0][:, next_token_idx, :] # (batch_size * num_beams, vocab_size) - assert outputs[0].shape[1] == model_inputs['input_ids'].shape[1] - - # if model has past, then set the past variable to speed up decoding - if self._do_output_past(outputs): - past = outputs[1] - return next_token_logits, past - - def _generate_no_beam_search( - self, - input_ids, - cur_len, - max_length, - do_sample, - temperature, - top_k, - top_p, - repetition_penalty, - pad_token_id, - eos_token_ids, - batch_size, - ): - """ Generate sequences for each example without beam search (num_beams == 1). - All returned sequence are generated independantly. 
- """ - assert self.num_keep_best == 1, 'cannot generate >1 sentences in greedy search' - # current position / max lengths / length of generated sentences / unfinished sentences - unfinished_sents = [] - if torch._C._get_tracing_state(): - cur_unfinished = torch.ones(1, dtype=input_ids) - else: - cur_unfinished = input_ids.new(batch_size).fill_(1) - - # log of scores for each sentence in the batch - logprobs = [] - - past = None - - while cur_len < max_length: - model_inputs = self.prepare_inputs_for_generation(input_ids, - past=past) - outputs = self(**model_inputs) - if cur_len == 1: - token_len = 2 + self.od_labels_len - next_token_idx = 1 - else: - assert cur_len > 1 - if not self._do_output_past(outputs): - token_len = cur_len + 1 + self.od_labels_len - next_token_idx = cur_len - else: - token_len = 2 - next_token_idx = 1 - - assert outputs[0].shape[1] == token_len - next_token_logits = outputs[0][:, next_token_idx, :] - - # if model has past, then set the past variable to speed up decoding - if self._do_output_past(outputs): - past = outputs[1] - - # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) - if repetition_penalty != 1.0: - for i in range(batch_size): - for previous_token in set(input_ids[i].tolist()): - # if score < 0 then repetition penalty has to multiplied to reduce the previous token probability - if next_token_logits[i, previous_token] < 0: - next_token_logits[ - i, previous_token] *= repetition_penalty - else: - next_token_logits[ - i, previous_token] /= repetition_penalty - - if do_sample: - # Temperature (higher temperature => more likely to sample low probability tokens) - if temperature != 1.0: - next_token_logits = next_token_logits / temperature - # Top-p/top-k filtering - next_token_logits = top_k_top_p_filtering(next_token_logits, - top_k=top_k, - top_p=top_p) - # Sample - next_token = torch.multinomial(F.softmax(next_token_logits, - dim=-1), - num_samples=1).squeeze(1) - else: - # Greedy decoding - next_token = torch.argmax(next_token_logits, dim=-1) - - # Compute scores - _scores = F.log_softmax(next_token_logits, - dim=-1) # (batch_size, vocab_size) - _scores = torch.gather(_scores, -1, - next_token.unsqueeze(-1)) # (batch_size, 1) - logprobs.append(_scores) # (batch_size, 1) - unfinished_sents.append(cur_unfinished) - - # update generations and finished sentences - tokens_to_add = next_token * cur_unfinished + pad_token_id * ( - 1 - cur_unfinished) - input_ids = torch.cat( - [input_ids, tokens_to_add.unsqueeze(-1)], dim=-1) - - #for t in input_ids: - #print(self.tokenizer.convert_ids_to_tokens(t.tolist())) - - for eos_token_id in eos_token_ids: - cur_unfinished = cur_unfinished.mul( - tokens_to_add.ne(eos_token_id).long()) - cur_len = cur_len + 1 - - # stop when there is a in each sentence, or if we exceed the maximul length - if cur_unfinished.max() == 0: - break - - # add eos_token_ids to unfinished sentences - if cur_len == max_length: - input_ids[:, -1].masked_fill_(cur_unfinished.to(dtype=torch.bool), - eos_token_ids[0]) - - logprobs = torch.cat(logprobs, dim=1) - unfinished_sents = torch.stack(unfinished_sents, dim=1).float() - sum_logprobs = (logprobs * unfinished_sents).sum(dim=1) - # return logprobs to keep consistent with beam search output - logprobs = sum_logprobs / unfinished_sents.sum(dim=1) - - # pad to the same length, otherwise DataParallel will give error - pad_len = max_length - input_ids.shape[1] - if pad_len > 0: - padding_ids = input_ids.new(batch_size, - pad_len).fill_(pad_token_id) - input_ids = 
torch.cat([input_ids, padding_ids], dim=1) - - # (batch_size, n_best, max_len), (batch_size, n_best) - return input_ids.unsqueeze(1), logprobs.unsqueeze(1) - - def _generate_beam_search( - self, - input_ids, - cur_len, - max_length, - do_sample, - temperature, - top_k, - top_p, - repetition_penalty, - pad_token_id, - eos_token_ids, - batch_size, - length_penalty, - num_beams, - vocab_size, - ): - """ Generate sequences for each example with beam search. - """ - # Expand input to num beams - input_ids = input_ids.unsqueeze(1).expand(batch_size, num_beams, - cur_len) - input_ids = input_ids.contiguous().view( - batch_size * num_beams, - cur_len) # (batch_size * num_beams, cur_len) - - # generated hypotheses - num_keep_best = self.num_keep_best - generated_hyps = [ - BeamHypotheses(num_keep_best, - max_length, - length_penalty, - early_stopping=False) for _ in range(batch_size) - ] - # NOTE: Expand >1 words to leave some spare tokens to keep the - # beam size, because some sentences may end here and cannot expand - # in the next level - TOPN_PER_BEAM = 2 - - # scores for each sentence in the beam - beam_scores = torch.zeros((batch_size, num_beams), - dtype=torch.float, - device=input_ids.device) - beam_scores[:, 1:] = -1e9 - beam_scores = beam_scores.view(-1) # shape (batch_size * num_beams,) - - # cache compute states - past = None - - # done sentences - done = [False for _ in range(batch_size)] - - while cur_len < max_length: - model_inputs = self.prepare_inputs_for_generation(input_ids, - past=past) - outputs = self(**model_inputs - ) # (batch_size * num_beams, cur_len, vocab_size) - if cur_len == 1: - token_len = 2 + self.od_labels_len - next_token_idx = 1 - else: - assert cur_len > 1 - if not self._do_output_past(outputs): - token_len = cur_len + 1 + self.od_labels_len - next_token_idx = cur_len - else: - token_len = 2 - next_token_idx = 1 - - assert outputs[0].shape[1] == token_len - scores = outputs[ - 0][:, - next_token_idx, :] # (batch_size * num_beams, vocab_size) - assert outputs[0].shape[1] == model_inputs['input_ids'].shape[1] - - # if model has past, then set the past variable to speed up decoding - if self._do_output_past(outputs): - past = outputs[1] - - # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) - if repetition_penalty != 1.0: - for i in range(batch_size * num_beams): - for previous_token in set(input_ids[i].tolist()): - # if score < 0 then repetition penalty has to multiplied to reduce the previous token probability - if scores[i, previous_token] < 0: - scores[i, previous_token] *= repetition_penalty - else: - scores[i, previous_token] /= repetition_penalty - - if do_sample: - # Temperature (higher temperature => more likely to sample low probability tokens) - if temperature != 1.0: - scores = scores / temperature - # Top-p/top-k filtering - scores = top_k_top_p_filtering( - scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 - ) # (batch_size * num_beams, vocab_size) - # Sample [TOPN_PER_BEAM] next words for each beam (so we have some spare tokens and match output of greedy beam search) - next_words = torch.multinomial( - F.softmax(scores, dim=-1), num_samples=TOPN_PER_BEAM - ) # (batch_size * num_beams, TOPN_PER_BEAM) - # Compute next scores - _scores = F.log_softmax( - scores, dim=-1) # (batch_size * num_beams, vocab_size) - _scores = torch.gather( - _scores, -1, - next_words) # (batch_size * num_beams, TOPN_PER_BEAM) - next_scores = _scores + beam_scores[:, None].expand_as( - _scores) # (batch_size * num_beams, TOPN_PER_BEAM) - # Match 
-                beam_indices = torch.arange(num_beams) * vocab_size
-                beam_indices = beam_indices.repeat(batch_size, TOPN_PER_BEAM).to(next_words.device)
-                next_words = next_words.view(batch_size, TOPN_PER_BEAM * num_beams)  # (batch_size, TOPN_PER_BEAM * num_beams)
-                next_words = next_words + beam_indices
-                next_scores = next_scores.view(batch_size, TOPN_PER_BEAM * num_beams)  # (batch_size, TOPN_PER_BEAM * num_beams)
-            else:
-                # do greedy beam search
-                scores = F.log_softmax(scores, dim=-1)  # (batch_size * num_beams, vocab_size)
-                assert scores.size() == (batch_size * num_beams, vocab_size)
-                # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product)
-                _scores = scores + beam_scores[:, None].expand_as(scores)  # (batch_size * num_beams, vocab_size)
-                # re-organize to group the beams together (we are keeping the top hypotheses across beams)
-                _scores = _scores.view(batch_size, num_beams * vocab_size)  # (batch_size, num_beams * vocab_size)
-                next_scores, next_words = torch.topk(_scores, TOPN_PER_BEAM * num_beams, dim=1, largest=True, sorted=True)
-
-            assert next_scores.size() == next_words.size() == (batch_size, TOPN_PER_BEAM * num_beams)
-
-            # next batch beam content
-            # list of (batch_size * num_beams) tuple(next hypothesis score, next word, current position in the batch)
-            next_batch_beam = []
-
-            # for each sentence
-            for batch_ex in range(batch_size):
-
-                # if we are done with this sentence
-                done[batch_ex] = done[batch_ex] or generated_hyps[batch_ex].is_done(next_scores[batch_ex].max().item())
-                if done[batch_ex]:
-                    next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams)  # pad the batch
-                    continue
-
-                # next sentence beam content
-                next_sent_beam = []
-
-                # next words for this sentence
-                for idx, score in zip(next_words[batch_ex], next_scores[batch_ex]):
-
-                    # get beam and word IDs
-                    beam_id = idx // vocab_size
-                    word_id = idx % vocab_size
-
-                    # end of sentence, or next word
-                    if word_id.item() in eos_token_ids or cur_len + 1 == max_length:
-                        generated_hyps[batch_ex].add(
-                            input_ids[batch_ex * num_beams + beam_id, :cur_len].clone(), score.item())
-                    else:
-                        next_sent_beam.append((score, word_id, batch_ex * num_beams + beam_id))
-
-                    # the beam for next step is full
-                    if len(next_sent_beam) == num_beams:
-                        break
-
-                # update next beam content
-                if cur_len + 1 == max_length:
-                    assert len(next_sent_beam) == 0
-                else:
-                    assert len(next_sent_beam) == num_beams
-
-                if len(next_sent_beam) == 0:
-                    next_sent_beam = [(0, pad_token_id, 0)] * num_beams  # pad the batch
-                next_batch_beam.extend(next_sent_beam)
-                assert len(next_batch_beam) == num_beams * (batch_ex + 1)
-
-            # sanity check / prepare next batch
-            assert len(next_batch_beam) == batch_size * num_beams
-            beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
-            beam_words = input_ids.new([x[1] for x in next_batch_beam])
-            beam_idx = input_ids.new([x[2] for x in next_batch_beam])
-
-            # re-order batch
-            input_ids = input_ids[beam_idx, :]
-            input_ids = torch.cat([input_ids, beam_words.unsqueeze(1)], dim=-1)
-
-            # re-order internal states
-            if past:
-                reordered_past = []
-                for layer_past in past:
-                    # get the correct batch idx from layer past batch dim
-                    # batch dim of `past` and `mems` is at 1st position
-                    reordered_layer_past = [
-                        layer_past[i].unsqueeze(0).clone().detach() for i in beam_idx
-                    ]
-                    reordered_layer_past = torch.cat(reordered_layer_past, dim=0)
-                    # check that shape matches
-                    assert reordered_layer_past.shape == layer_past.shape
-                    reordered_past.append(reordered_layer_past)
-                past = tuple(reordered_past)
-
-            # update current length
-            cur_len = cur_len + 1
-
-            # stop when we are done with each sentence
-            if all(done):
-                break
-
-        # visualize hypotheses
-        # print([len(x) for x in generated_hyps], cur_len)
-        # globals().update( locals() );
-        # !import code; code.interact(local=vars())
-        # for ii in range(batch_size):
-        #     for ss, ww in sorted(generated_hyps[ii].hyp, key=lambda x: x[0], reverse=True):
-        #         print("%.3f " % ss + " ".join(self.dico[x] for x in ww.tolist()))
-        #     print("")
-
-        # select the best hypotheses
-        tgt_len = torch.ones(batch_size, num_keep_best, dtype=torch.long)
-        logprobs = torch.zeros(batch_size, num_keep_best, dtype=torch.float).fill_(-1e5).to(input_ids.device)
-        all_best = []
-
-        for i, hypotheses in enumerate(generated_hyps):
-            best = []
-            hyp_scores = torch.tensor([x[0] for x in hypotheses.hyp])
-            _, best_indices = torch.topk(hyp_scores, min(num_keep_best, len(hyp_scores)), largest=True)
-            for best_idx, hyp_idx in enumerate(best_indices):
-                conf, best_hyp = hypotheses.hyp[hyp_idx]
-                best.append(best_hyp)
-                logprobs[i, best_idx] = conf
-                tgt_len[i, best_idx] = len(best_hyp) + 1  # +1 for the EOS symbol
-
-            all_best.append(best)
-
-        # generate target batch, pad to the same length
-        decoded = input_ids.new(batch_size, num_keep_best, max_length).fill_(pad_token_id)
-        for batch_idx, best in enumerate(all_best):
-            for best_idx, hypo in enumerate(best):
-                decoded[batch_idx, best_idx, :tgt_len[batch_idx, best_idx] - 1] = hypo
-                decoded[batch_idx, best_idx, tgt_len[batch_idx, best_idx] - 1] = eos_token_ids[0]
-
-        return decoded, logprobs
-
-
-def top_k_top_p_filtering(logits,
-                          top_k=0,
-                          top_p=1.0,
-                          filter_value=-float("Inf"),
-                          min_tokens_to_keep=1):
-    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
-        Args:
-            logits: logits distribution shape (batch size, vocabulary size)
-            if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
-            if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
-                Nucleus filtering is described in Holtzman et al.
(http://arxiv.org/abs/1904.09751) - Make sure we keep at least min_tokens_to_keep per batch example in the output - From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 - """ - if top_k > 0: - top_k = min(max(top_k, min_tokens_to_keep), - logits.size(-1)) # Safety check - # Remove all tokens with a probability less than the last token of the top-k - indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, - None] - logits[indices_to_remove] = filter_value - - if top_p < 1.0: - sorted_logits, sorted_indices = torch.sort(logits, descending=True) - cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), - dim=-1) - - # Remove tokens with cumulative probability above the threshold (token with 0 are kept) - sorted_indices_to_remove = cumulative_probs > top_p - if min_tokens_to_keep > 1: - # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) - sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 - # Shift the indices to the right to keep also the first token above the threshold - sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[ - ..., :-1].clone() - sorted_indices_to_remove[..., 0] = 0 - - # scatter sorted tensors to original indexing - indices_to_remove = sorted_indices_to_remove.scatter( - 1, sorted_indices, sorted_indices_to_remove) - logits[indices_to_remove] = filter_value - return logits - - -class BeamHypotheses(object): - - def __init__(self, n_hyp, max_length, length_penalty, early_stopping): - """ - Initialize n-best list of hypotheses. - """ - self.max_length = max_length - 1 # ignoring bos_token - self.length_penalty = length_penalty - self.early_stopping = early_stopping - self.n_hyp = n_hyp - self.hyp = [] - self.worst_score = 1e9 - - def __len__(self): - """ - Number of hypotheses in the list. - """ - return len(self.hyp) - - def add(self, hyp, sum_logprobs): - """ - Add a new hypothesis to the list. - """ - score = sum_logprobs / len(hyp)**self.length_penalty - if len(self) < self.n_hyp or score > self.worst_score: - self.hyp.append((score, hyp)) - if len(self) > self.n_hyp: - sorted_scores = sorted([ - (s, idx) for idx, (s, _) in enumerate(self.hyp) - ]) - del self.hyp[sorted_scores[0][1]] - self.worst_score = sorted_scores[1][0] - else: - self.worst_score = min(score, self.worst_score) - - def is_done(self, best_sum_logprobs): - """ - If there are enough hypotheses and that none of the hypotheses being generated - can become better than the worst one in the heap, then we are done with this sentence. - """ - if len(self) < self.n_hyp: - return False - elif self.early_stopping: - return True - else: - return self.worst_score >= best_sum_logprobs / self.max_length**self.length_penalty - - -class Conv1D(nn.Module): - - def __init__(self, nf, nx): - """ Conv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2) - Basically works like a Linear layer but the weights are transposed - """ - super(Conv1D, self).__init__() - self.nf = nf - w = torch.empty(nx, nf) - nn.init.normal_(w, std=0.02) - self.weight = nn.Parameter(w) - self.bias = nn.Parameter(torch.zeros(nf)) - - def forward(self, x): - size_out = x.size()[:-1] + (self.nf, ) - x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) - x = x.view(*size_out) - return x - - -class PoolerStartLogits(nn.Module): - """ Compute SQuAD start_logits from sequence hidden states. 
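For orientation, a minimal usage sketch of the top_k_top_p_filtering helper above (the toy logits are invented for illustration). Note the function modifies its input in place, hence the clone():

import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, 1.0, 0.5, -1.0, -3.0]])   # batch of 1, vocabulary of 5

# top-k: keep the 2 highest-scoring tokens; everything else becomes -inf
print(top_k_top_p_filtering(logits.clone(), top_k=2))
# tensor([[2., 1., -inf, -inf, -inf]])

# nucleus (top-p): keep the smallest prefix of the sorted distribution whose
# cumulative probability exceeds top_p, then sample from what survives
filtered = top_k_top_p_filtering(logits.clone(), top_p=0.9)
next_token = torch.multinomial(F.softmax(filtered, dim=-1), num_samples=1)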
""" - - def __init__(self, config): - super(PoolerStartLogits, self).__init__() - self.dense = nn.Linear(config.hidden_size, 1) - - def forward(self, hidden_states, p_mask=None): - """ Args: - **p_mask**: (`optional`) ``torch.FloatTensor`` of shape `(batch_size, seq_len)` - invalid position mask such as query and special symbols (PAD, SEP, CLS) - 1.0 means token should be masked. - """ - x = self.dense(hidden_states).squeeze(-1) - - if p_mask is not None: - x = x * (1 - p_mask) - 1e30 * p_mask - - return x - - -class PoolerEndLogits(nn.Module): - """ Compute SQuAD end_logits from sequence hidden states and start token hidden state. - """ - - def __init__(self, config): - super(PoolerEndLogits, self).__init__() - self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) - self.activation = nn.Tanh() - self.LayerNorm = nn.LayerNorm(config.hidden_size, - eps=config.layer_norm_eps) - self.dense_1 = nn.Linear(config.hidden_size, 1) - - def forward(self, - hidden_states, - start_states=None, - start_positions=None, - p_mask=None): - """ Args: - One of ``start_states``, ``start_positions`` should be not None. - If both are set, ``start_positions`` overrides ``start_states``. - - **start_states**: ``torch.LongTensor`` of shape identical to hidden_states - hidden states of the first tokens for the labeled span. - **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` - position of the first token for the labeled span: - **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)`` - Mask of invalid position such as query and special symbols (PAD, SEP, CLS) - 1.0 means token should be masked. - """ - assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None" - if start_positions is not None: - slen, hsz = hidden_states.shape[-2:] - start_positions = start_positions[:, None, None].expand( - -1, -1, hsz) # shape (bsz, 1, hsz) - start_states = hidden_states.gather( - -2, start_positions) # shape (bsz, 1, hsz) - start_states = start_states.expand(-1, slen, - -1) # shape (bsz, slen, hsz) - - x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1)) - x = self.activation(x) - x = self.LayerNorm(x) - x = self.dense_1(x).squeeze(-1) - - if p_mask is not None: - x = x * (1 - p_mask) - 1e30 * p_mask - - return x - - -class PoolerAnswerClass(nn.Module): - """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """ - - def __init__(self, config): - super(PoolerAnswerClass, self).__init__() - self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) - self.activation = nn.Tanh() - self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False) - - def forward(self, - hidden_states, - start_states=None, - start_positions=None, - cls_index=None): - """ - Args: - One of ``start_states``, ``start_positions`` should be not None. - If both are set, ``start_positions`` overrides ``start_states``. - - **start_states**: ``torch.LongTensor`` of shape identical to ``hidden_states``. - hidden states of the first tokens for the labeled span. - **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` - position of the first token for the labeled span. - **cls_index**: torch.LongTensor of shape ``(batch_size,)`` - position of the CLS token. If None, take the last token. 
- - note(Original repo): - no dependency on end_feature so that we can obtain one single `cls_logits` - for each sample - """ - hsz = hidden_states.shape[-1] - assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None" - if start_positions is not None: - start_positions = start_positions[:, None, None].expand( - -1, -1, hsz) # shape (bsz, 1, hsz) - start_states = hidden_states.gather(-2, start_positions).squeeze( - -2) # shape (bsz, hsz) - - if cls_index is not None: - cls_index = cls_index[:, None, - None].expand(-1, -1, - hsz) # shape (bsz, 1, hsz) - cls_token_state = hidden_states.gather(-2, cls_index).squeeze( - -2) # shape (bsz, hsz) - else: - cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz) - - x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1)) - x = self.activation(x) - x = self.dense_1(x).squeeze(-1) - - return x - - -class SQuADHead(nn.Module): - r""" A SQuAD head inspired by XLNet. - - Parameters: - config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model. - - Inputs: - **hidden_states**: ``torch.FloatTensor`` of shape ``(batch_size, seq_len, hidden_size)`` - hidden states of sequence tokens - **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` - position of the first token for the labeled span. - **end_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` - position of the last token for the labeled span. - **cls_index**: torch.LongTensor of shape ``(batch_size,)`` - position of the CLS token. If None, take the last token. - **is_impossible**: ``torch.LongTensor`` of shape ``(batch_size,)`` - Whether the question has a possible answer in the paragraph or not. - **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)`` - Mask of invalid position such as query and special symbols (PAD, SEP, CLS) - 1.0 means token should be masked. - - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``: - Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses. - **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) - ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)`` - Log probabilities for the top config.start_n_top start token possibilities (beam-search). - **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) - ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)`` - Indices for the top config.start_n_top start token possibilities (beam-search). - **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) - ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` - Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). - **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) - ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` - Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). 
- **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) - ``torch.FloatTensor`` of shape ``(batch_size,)`` - Log probabilities for the ``is_impossible`` label of the answers. - """ - - def __init__(self, config): - super(SQuADHead, self).__init__() - self.start_n_top = config.start_n_top - self.end_n_top = config.end_n_top - - self.start_logits = PoolerStartLogits(config) - self.end_logits = PoolerEndLogits(config) - self.answer_class = PoolerAnswerClass(config) - - def forward(self, - hidden_states, - start_positions=None, - end_positions=None, - cls_index=None, - is_impossible=None, - p_mask=None): - outputs = () - - start_logits = self.start_logits(hidden_states, p_mask=p_mask) - - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, let's remove the dimension added by batch splitting - for x in (start_positions, end_positions, cls_index, - is_impossible): - if x is not None and x.dim() > 1: - x.squeeze_(-1) - - # during training, compute the end logits based on the ground truth of the start position - end_logits = self.end_logits(hidden_states, - start_positions=start_positions, - p_mask=p_mask) - - loss_fct = CrossEntropyLoss() - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - - if cls_index is not None and is_impossible is not None: - # Predict answerability from the representation of CLS and START - cls_logits = self.answer_class(hidden_states, - start_positions=start_positions, - cls_index=cls_index) - loss_fct_cls = nn.BCEWithLogitsLoss() - cls_loss = loss_fct_cls(cls_logits, is_impossible) - - # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss - total_loss += cls_loss * 0.5 - - outputs = (total_loss, ) + outputs - - else: - # during inference, compute the end logits based on beam search - bsz, slen, hsz = hidden_states.size() - start_log_probs = F.softmax(start_logits, - dim=-1) # shape (bsz, slen) - - start_top_log_probs, start_top_index = torch.topk( - start_log_probs, self.start_n_top, - dim=-1) # shape (bsz, start_n_top) - start_top_index_exp = start_top_index.unsqueeze(-1).expand( - -1, -1, hsz) # shape (bsz, start_n_top, hsz) - start_states = torch.gather( - hidden_states, -2, - start_top_index_exp) # shape (bsz, start_n_top, hsz) - start_states = start_states.unsqueeze(1).expand( - -1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) - - hidden_states_expanded = hidden_states.unsqueeze(2).expand_as( - start_states) # shape (bsz, slen, start_n_top, hsz) - p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None - end_logits = self.end_logits(hidden_states_expanded, - start_states=start_states, - p_mask=p_mask) - end_log_probs = F.softmax(end_logits, - dim=1) # shape (bsz, slen, start_n_top) - - end_top_log_probs, end_top_index = torch.topk( - end_log_probs, self.end_n_top, - dim=1) # shape (bsz, end_n_top, start_n_top) - end_top_log_probs = end_top_log_probs.view( - -1, self.start_n_top * self.end_n_top) - end_top_index = end_top_index.view( - -1, self.start_n_top * self.end_n_top) - - start_states = torch.einsum("blh,bl->bh", hidden_states, - start_log_probs) - cls_logits = self.answer_class(hidden_states, - start_states=start_states, - cls_index=cls_index) - - outputs = (start_top_log_probs, start_top_index, end_top_log_probs, - end_top_index, cls_logits) + outputs - - # return start_top_log_probs, start_top_index, 
end_top_log_probs, end_top_index, cls_logits
-        # or (if labels are provided) (total_loss,)
-        return outputs
-
-
-class SequenceSummary(nn.Module):
-    r""" Compute a single vector summary of a sequence hidden states according to various possibilities:
-        Args of the config class:
-            summary_type:
-                - 'last' => [default] take the last token hidden state (like XLNet)
-                - 'first' => take the first token hidden state (like Bert)
-                - 'mean' => take the mean of all tokens hidden states
-                - 'token_ids' => supply a Tensor of classification token indices (GPT/GPT-2)
-                - 'attn' => Not implemented now, use multi-head attention
-            summary_use_proj: Add a projection after the vector extraction
-            summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
-            summary_activation: 'tanh' => add a tanh activation to the output; anything else => no activation (default)
-            summary_first_dropout: Add a dropout before the projection and activation
-            summary_last_dropout: Add a dropout after the projection and activation
-    """
-
-    def __init__(self, config):
-        super(SequenceSummary, self).__init__()
-
-        self.summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last'
-        if self.summary_type == 'attn':
-            # We should use a standard multi-head attention module with absolute positional embedding for that.
-            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
-            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
-            raise NotImplementedError
-
-        self.summary = Identity()
-        if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
-            if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
-                num_classes = config.num_labels
-            else:
-                num_classes = config.hidden_size
-            self.summary = nn.Linear(config.hidden_size, num_classes)
-
-        self.activation = Identity()
-        if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
-            self.activation = nn.Tanh()
-
-        self.first_dropout = Identity()
-        if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
-            self.first_dropout = nn.Dropout(config.summary_first_dropout)
-
-        self.last_dropout = Identity()
-        if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
-            self.last_dropout = nn.Dropout(config.summary_last_dropout)
-
-    def forward(self, hidden_states, token_ids=None):
-        """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
-            token_ids: [optional] index of the classification token if summary_type == 'token_ids',
-                shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
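A minimal smoke-test sketch of SQuADHead's two branches, assuming the module-level imports of the original file (torch, nn, F, CrossEntropyLoss) and a hypothetical bare-bones config carrying only the fields the head actually reads:

from types import SimpleNamespace

import torch

# hypothetical minimal config, not part of the original file
cfg = SimpleNamespace(hidden_size=8, layer_norm_eps=1e-12, start_n_top=2, end_n_top=2)
head = SQuADHead(cfg)

hidden = torch.randn(3, 10, 8)          # (bsz, seq_len, hidden_size)
start = torch.randint(0, 10, (3,))
end = torch.randint(0, 10, (3,))

# training branch: both positions given -> outputs is (total_loss,)
loss, = head(hidden, start_positions=start, end_positions=end)

# inference branch: beam-search style top-k start/end candidates
start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = head(hidden)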
- if summary_type == 'token_ids' and token_ids is None: - we take the last token of the sequence as classification token - """ - if self.summary_type == 'last': - output = hidden_states[:, -1] - elif self.summary_type == 'first': - output = hidden_states[:, 0] - elif self.summary_type == 'mean': - output = hidden_states.mean(dim=1) - elif self.summary_type == 'token_ids': - if token_ids is None: - token_ids = torch.full_like(hidden_states[..., :1, :], - hidden_states.shape[-2] - 1, - dtype=torch.long) - else: - token_ids = token_ids.unsqueeze(-1).unsqueeze(-1) - token_ids = token_ids.expand((-1, ) * (token_ids.dim() - 1) + - (hidden_states.size(-1), )) - # shape of token_ids: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states - output = hidden_states.gather(-2, token_ids).squeeze( - -2) # shape (bsz, XX, hidden_size) - elif self.summary_type == 'attn': - raise NotImplementedError - - output = self.first_dropout(output) - output = self.summary(output) - output = self.activation(output) - output = self.last_dropout(output) - - return output - - -def prune_linear_layer(layer, index, dim=0): - """ Prune a linear layer (a model parameters) to keep only entries in index. - Return the pruned layer as a new layer with requires_grad=True. - Used to remove heads. - """ - index = index.to(layer.weight.device) - W = layer.weight.index_select(dim, index).clone().detach() - if layer.bias is not None: - if dim == 1: - b = layer.bias.clone().detach() - else: - b = layer.bias[index].clone().detach() - new_size = list(layer.weight.size()) - new_size[dim] = len(index) - new_layer = nn.Linear(new_size[1], - new_size[0], - bias=layer.bias is not None).to(layer.weight.device) - new_layer.weight.requires_grad = False - new_layer.weight.copy_(W.contiguous()) - new_layer.weight.requires_grad = True - if layer.bias is not None: - new_layer.bias.requires_grad = False - new_layer.bias.copy_(b.contiguous()) - new_layer.bias.requires_grad = True - return new_layer - - -def prune_conv1d_layer(layer, index, dim=1): - """ Prune a Conv1D layer (a model parameters) to keep only entries in index. - A Conv1D work as a Linear layer (see e.g. BERT) but the weights are transposed. - Return the pruned layer as a new layer with requires_grad=True. - Used to remove heads. - """ - index = index.to(layer.weight.device) - W = layer.weight.index_select(dim, index).clone().detach() - if dim == 0: - b = layer.bias.clone().detach() - else: - b = layer.bias[index].clone().detach() - new_size = list(layer.weight.size()) - new_size[dim] = len(index) - new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device) - new_layer.weight.requires_grad = False - new_layer.weight.copy_(W.contiguous()) - new_layer.weight.requires_grad = True - new_layer.bias.requires_grad = False - new_layer.bias.copy_(b.contiguous()) - new_layer.bias.requires_grad = True - return new_layer - - -def prune_layer(layer, index, dim=None): - """ Prune a Conv1D or nn.Linear layer (a model parameters) to keep only entries in index. - Return the pruned layer as a new layer with requires_grad=True. - Used to remove heads. 
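To make the pruning helpers concrete, a small sketch with prune_linear_layer as defined above (shapes invented; dim=0 prunes output units, so only the indexed rows of the weight matrix survive):

import torch
from torch import nn

layer = nn.Linear(6, 4)              # weight shape (4, 6), bias shape (4,)
keep = torch.tensor([0, 2, 3])       # output units to keep
pruned = prune_linear_layer(layer, keep, dim=0)
print(pruned.weight.shape)           # torch.Size([3, 6])
print(pruned.bias.shape)             # torch.Size([3])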
- """ - if isinstance(layer, nn.Linear): - return prune_linear_layer(layer, index, dim=0 if dim is None else dim) - elif isinstance(layer, Conv1D): - return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim) - else: - raise ValueError("Can't prune layer of class {}".format( - layer.__class__)) diff --git a/AVLFormer/src/layers/bert/tokenization_bert.py b/AVLFormer/src/layers/bert/tokenization_bert.py deleted file mode 100644 index 43880bd..0000000 --- a/AVLFormer/src/layers/bert/tokenization_bert.py +++ /dev/null @@ -1,507 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tokenization classes.""" - -from __future__ import absolute_import, division, print_function, unicode_literals - -import collections -from io import open -import logging -import os -from random import randint -import unicodedata - -from src.utils.comm import is_main_process - -from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization - -logger = logging.getLogger(__name__) # pylint: disable=invalid-name -if not is_main_process(): - logger.disabled = True - -VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} - -PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': { - 'bert-base-uncased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", - 'bert-large-uncased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", - 'bert-base-cased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", - 'bert-large-cased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", - 'bert-base-multilingual-uncased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", - 'bert-base-multilingual-cased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", - 'bert-base-chinese': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", - 'bert-base-german-cased': - "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", - 'bert-large-uncased-whole-word-masking': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt", - 'bert-large-cased-whole-word-masking': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt", - 'bert-large-uncased-whole-word-masking-finetuned-squad': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt", - 'bert-large-cased-whole-word-masking-finetuned-squad': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt", - 'bert-base-cased-finetuned-mrpc': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", - } -} - -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 
= { - 'bert-base-uncased': 512, - 'bert-large-uncased': 512, - 'bert-base-cased': 512, - 'bert-large-cased': 512, - 'bert-base-multilingual-uncased': 512, - 'bert-base-multilingual-cased': 512, - 'bert-base-chinese': 512, - 'bert-base-german-cased': 512, - 'bert-large-uncased-whole-word-masking': 512, - 'bert-large-cased-whole-word-masking': 512, - 'bert-large-uncased-whole-word-masking-finetuned-squad': 512, - 'bert-large-cased-whole-word-masking-finetuned-squad': 512, - 'bert-base-cased-finetuned-mrpc': 512, -} - - -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = collections.OrderedDict() - with open(vocab_file, "r", encoding="utf-8") as reader: - tokens = reader.readlines() - for index, token in enumerate(tokens): - token = token.rstrip('\n') - vocab[token] = index - return vocab - - -def whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a piece of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -class BertTokenizer(PreTrainedTokenizer): - r""" - Constructs a BertTokenizer. - :class:`~pytorch_pretrained_bert.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece - - Args: - vocab_file: Path to a one-wordpiece-per-line vocabulary file - do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False - do_basic_tokenize: Whether to do basic tokenization before wordpiece. - max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the - minimum of this value (if specified) and the underlying BERT model's sequence length. - never_split: List of tokens which will never be split during tokenization. Only has an effect when - do_wordpiece_only=False - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - - def __init__(self, - vocab_file, - do_lower_case=True, - do_basic_tokenize=True, - never_split=None, - unk_token="[UNK]", - sep_token="[SEP]", - pad_token="[PAD]", - cls_token="[CLS]", - mask_token="[MASK]", - tokenize_chinese_chars=True, - **kwargs): - """Constructs a BertTokenizer. - - Args: - **vocab_file**: Path to a one-wordpiece-per-line vocabulary file - **do_lower_case**: (`optional`) boolean (default True) - Whether to lower case the input - Only has an effect when do_basic_tokenize=True - **do_basic_tokenize**: (`optional`) boolean (default True) - Whether to do basic tokenization before wordpiece. - **never_split**: (`optional`) list of string - List of tokens which will never be split during tokenization. - Only has an effect when do_basic_tokenize=True - **tokenize_chinese_chars**: (`optional`) boolean (default True) - Whether to tokenize Chinese characters. - This should likely be desactivated for Japanese: - see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 - """ - super(BertTokenizer, self).__init__(unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - **kwargs) - if not os.path.isfile(vocab_file): - raise ValueError( - "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " - "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" - .format(vocab_file)) - self.vocab = load_vocab(vocab_file) - self.ids_to_tokens = collections.OrderedDict([ - (ids, tok) for tok, ids in self.vocab.items() - ]) - self.do_basic_tokenize = do_basic_tokenize - if do_basic_tokenize: - self.basic_tokenizer = BasicTokenizer( - do_lower_case=do_lower_case, - never_split=never_split, - tokenize_chinese_chars=tokenize_chinese_chars) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, - unk_token=self.unk_token) - - @property - def vocab_size(self): - return len(self.vocab) - - def _tokenize(self, text): - split_tokens = [] - if self.do_basic_tokenize: - for token in self.basic_tokenizer.tokenize( - text, never_split=self.all_special_tokens): - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - else: - split_tokens = self.wordpiece_tokenizer.tokenize(text) - return split_tokens - - def _tokenize_for_pos_tag(self, text): - split_tokens = [] - basic_tokens = [] - sub_to_token_idx_map = [] - if self.do_basic_tokenize: - for idx, token in enumerate( - self.basic_tokenizer.tokenize( - text, never_split=self.all_special_tokens)): - basic_tokens.append(token) - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - sub_to_token_idx_map.append(idx) - return (split_tokens, basic_tokens, sub_to_token_idx_map) - else: - raise ValueError( - f"_tokenize_for_pos_tag must set self.do_basic_tokenize as True" - ) - - def _convert_token_to_id(self, token): - """ Converts a token (str/unicode) in an id using the vocab. """ - return self.vocab.get(token, self.vocab.get(self.unk_token)) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (string/unicode) using the vocab.""" - return self.ids_to_tokens.get(index, self.unk_token) - - def convert_tokens_to_string(self, tokens): - """ Converts a sequence of tokens (string) in a single string. """ - out_string = ' '.join(tokens).replace(' ##', '').strip() - return out_string - - def save_vocabulary(self, vocab_path): - """Save the tokenizer vocabulary to a directory or file.""" - index = 0 - if os.path.isdir(vocab_path): - vocab_file = os.path.join(vocab_path, - VOCAB_FILES_NAMES['vocab_file']) - with open(vocab_file, "w", encoding="utf-8") as writer: - for token, token_index in sorted(self.vocab.items(), - key=lambda kv: kv[1]): - if index != token_index: - logger.warning( - "Saving vocabulary to {}: vocabulary indices are not consecutive." - " Please check that the vocabulary is not corrupted!". - format(vocab_file)) - index = token_index - writer.write(token + u'\n') - index += 1 - return (vocab_file, ) - - def get_random_token(self): - i = randint(0, len(self.vocab)) - return self._convert_id_to_token(i) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): - """ Instantiate a BertTokenizer from pre-trained vocabulary files. - """ - if pretrained_model_name_or_path in PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES: - if '-cased' in pretrained_model_name_or_path and kwargs.get( - 'do_lower_case', True): - logger.warning( - "The pre-trained model you are loading is a cased model but you have not set " - "`do_lower_case` to False. 
We are setting `do_lower_case=False` for you but " - "you may want to check this behavior.") - kwargs['do_lower_case'] = False - elif '-cased' not in pretrained_model_name_or_path and not kwargs.get( - 'do_lower_case', True): - logger.warning( - "The pre-trained model you are loading is an uncased model but you have set " - "`do_lower_case` to False. We are setting `do_lower_case=True` for you " - "but you may want to check this behavior.") - kwargs['do_lower_case'] = True - - return super(BertTokenizer, - cls)._from_pretrained(pretrained_model_name_or_path, - *inputs, **kwargs) - - -class BasicTokenizer(object): - """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" - - def __init__(self, - do_lower_case=True, - never_split=None, - tokenize_chinese_chars=True): - """ Constructs a BasicTokenizer. - - Args: - **do_lower_case**: Whether to lower case the input. - **never_split**: (`optional`) list of str - Kept for backward compatibility purposes. - Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) - List of token not to split. - **tokenize_chinese_chars**: (`optional`) boolean (default True) - Whether to tokenize Chinese characters. - This should likely be desactivated for Japanese: - see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 - """ - if never_split is None: - never_split = [] - self.do_lower_case = do_lower_case - self.never_split = never_split - self.tokenize_chinese_chars = tokenize_chinese_chars - - def tokenize(self, text, never_split=None): - """ Basic Tokenization of a piece of text. - Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer. - - Args: - **never_split**: (`optional`) list of str - Kept for backward compatibility purposes. - Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) - List of token not to split. - """ - never_split = self.never_split + (never_split - if never_split is not None else []) - text = self._clean_text(text) - # This was added on November 1st, 2018 for the multilingual and Chinese - # models. This is also applied to the English models now, but it doesn't - # matter since the English models were not trained on any Chinese data - # and generally don't have any Chinese data in them (there are Chinese - # characters in the vocabulary because Wikipedia does have some Chinese - # words in the English Wikipedia.). 
- if self.tokenize_chinese_chars: - text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if self.do_lower_case and token not in never_split: - token = token.lower() - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token)) - - output_tokens = whitespace_tokenize(" ".join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize("NFD", text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - continue - output.append(char) - return "".join(output) - - def _run_split_on_punc(self, text, never_split=None): - """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: - return [text] - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) - start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 - - return ["".join(x) for x in output] - - def _tokenize_chinese_chars(self, text): - """Adds whitespace around any CJK character.""" - output = [] - for char in text: - cp = ord(char) - if self._is_chinese_char(cp): - output.append(" ") - output.append(char) - output.append(" ") - else: - output.append(char) - return "".join(output) - - def _is_chinese_char(self, cp): - """Checks whether CP is the codepoint of a CJK character.""" - # This defines a "chinese character" as anything in the CJK Unicode block: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - # - # Note that the CJK Unicode block is NOT all Japanese and Korean characters, - # despite its name. The modern Korean Hangul alphabet is a different block, - # as is Japanese Hiragana and Katakana. Those alphabets are used to write - # space-separated words, so they are not treated specially and handled - # like the all of the other languages. - if ((cp >= 0x4E00 and cp <= 0x9FFF) or # - (cp >= 0x3400 and cp <= 0x4DBF) or # - (cp >= 0x20000 and cp <= 0x2A6DF) or # - (cp >= 0x2A700 and cp <= 0x2B73F) or # - (cp >= 0x2B740 and cp <= 0x2B81F) or # - (cp >= 0x2B820 and cp <= 0x2CEAF) or - (cp >= 0xF900 and cp <= 0xFAFF) or # - (cp >= 0x2F800 and cp <= 0x2FA1F)): # - return True - - return False - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xfffd or _is_control(char): - continue - if _is_whitespace(char): - output.append(" ") - else: - output.append(char) - return "".join(output) - - -class WordpieceTokenizer(object): - """Runs WordPiece tokenization.""" - - def __init__(self, vocab, unk_token, max_input_chars_per_word=100): - self.vocab = vocab - self.unk_token = unk_token - self.max_input_chars_per_word = max_input_chars_per_word - - def tokenize(self, text): - """Tokenizes a piece of text into its word pieces. - - This uses a greedy longest-match-first algorithm to perform tokenization - using the given vocabulary. - - For example: - input = "unaffable" - output = ["un", "##aff", "##able"] - - Args: - text: A single token or whitespace separated tokens. This should have - already been passed through `BasicTokenizer`. - - Returns: - A list of wordpiece tokens. 
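As a concrete trace of the two-stage pipeline, here is a toy run pairing BasicTokenizer with the WordpieceTokenizer above (the vocabulary is invented; the greedy longest-match-first loop is the method body that follows):

vocab = {'[UNK]': 0, 'un': 1, '##aff': 2, '##able': 3, '.': 4}
basic = BasicTokenizer(do_lower_case=True)
wordpiece = WordpieceTokenizer(vocab=vocab, unk_token='[UNK]')

tokens = []
for word in basic.tokenize("Unaffable."):   # -> ['unaffable', '.']
    tokens.extend(wordpiece.tokenize(word))
print(tokens)   # ['un', '##aff', '##able', '.']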
- """ - - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - output_tokens.append(self.unk_token) - continue - - is_bad = False - start = 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - while start < end: - substr = "".join(chars[start:end]) - if start > 0: - substr = "##" + substr - if substr in self.vocab: - cur_substr = substr - break - end -= 1 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - start = end - - if is_bad: - output_tokens.append(self.unk_token) - else: - output_tokens.extend(sub_tokens) - return output_tokens - - -def _is_whitespace(char): - """Checks whether `chars` is a whitespace character.""" - # \t, \n, and \r are technically contorl characters but we treat them - # as whitespace since they are generally considered as such. - if char == " " or char == "\t" or char == "\n" or char == "\r": - return True - cat = unicodedata.category(char) - if cat == "Zs": - return True - return False - - -def _is_control(char): - """Checks whether `chars` is a control character.""" - # These are technically control characters but we count them as whitespace - # characters. - if char == "\t" or char == "\n" or char == "\r": - return False - cat = unicodedata.category(char) - if cat.startswith("C"): - return True - return False - - -def _is_punctuation(char): - """Checks whether `chars` is a punctuation character.""" - cp = ord(char) - # We treat all non-letter/number ASCII as punctuation. - # Characters such as "^", "$", and "`" are not in the Unicode - # Punctuation class but we treat them as punctuation anyways, for - # consistency. - if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) - or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): - return True - cat = unicodedata.category(char) - if cat.startswith("P"): - return True - return False diff --git a/AVLFormer/src/layers/bert/tokenization_utils.py b/AVLFormer/src/layers/bert/tokenization_utils.py deleted file mode 100644 index 498ed2f..0000000 --- a/AVLFormer/src/layers/bert/tokenization_utils.py +++ /dev/null @@ -1,559 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tokenization classes for OpenAI GPT.""" -from __future__ import absolute_import, division, print_function, unicode_literals - -from io import open -import json -import logging -import os - -import six -from src.utils.comm import is_main_process - -from .file_utils import cached_path - -logger = logging.getLogger(__name__) # pylint: disable=invalid-name -if not is_main_process(): - logger.disabled = True - -SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json' -ADDED_TOKENS_FILE = 'added_tokens.json' - - -class PreTrainedTokenizer(object): - """ An abstract class to handle dowloading and loading pretrained tokenizers and adding tokens to the vocabulary. 
- - Derived class can set up a few special tokens to be used in common scripts and internals: - bos_token, eos_token, EOP_TOKEN, EOD_TOKEN, unk_token, sep_token, pad_token, cls_token, mask_token - additional_special_tokens = [] - - We defined an added_tokens_encoder to add new tokens to the vocabulary without having to handle the - specific vocabulary augmentation methods of the various underlying dictionnary structures (BPE, sentencepiece...). - """ - vocab_files_names = {} - pretrained_vocab_files_map = {} - max_model_input_sizes = {} - - SPECIAL_TOKENS_ATTRIBUTES = [ - "bos_token", "eos_token", "unk_token", "sep_token", "pad_token", - "cls_token", "mask_token", "additional_special_tokens" - ] - - @property - def bos_token(self): - if self._bos_token is None: - logger.error("Using bos_token, but it is not set yet.") - return self._bos_token - - @property - def eos_token(self): - if self._eos_token is None: - logger.error("Using eos_token, but it is not set yet.") - return self._eos_token - - @property - def unk_token(self): - if self._unk_token is None: - logger.error("Using unk_token, but it is not set yet.") - return self._unk_token - - @property - def sep_token(self): - if self._sep_token is None: - logger.error("Using sep_token, but it is not set yet.") - return self._sep_token - - @property - def pad_token(self): - if self._pad_token is None: - logger.error("Using pad_token, but it is not set yet.") - return self._pad_token - - @property - def cls_token(self): - if self._cls_token is None: - logger.error("Using cls_token, but it is not set yet.") - return self._cls_token - - @property - def mask_token(self): - if self._mask_token is None: - logger.error("Using mask_token, but it is not set yet.") - return self._mask_token - - @property - def additional_special_tokens(self): - if self._additional_special_tokens is None: - logger.error( - "Using additional_special_tokens, but it is not set yet.") - return self._additional_special_tokens - - @bos_token.setter - def bos_token(self, value): - self._bos_token = value - - @eos_token.setter - def eos_token(self, value): - self._eos_token = value - - @unk_token.setter - def unk_token(self, value): - self._unk_token = value - - @sep_token.setter - def sep_token(self, value): - self._sep_token = value - - @pad_token.setter - def pad_token(self, value): - self._pad_token = value - - @cls_token.setter - def cls_token(self, value): - self._cls_token = value - - @mask_token.setter - def mask_token(self, value): - self._mask_token = value - - @additional_special_tokens.setter - def additional_special_tokens(self, value): - self._additional_special_tokens = value - - def __init__(self, max_len=None, **kwargs): - self._bos_token = None - self._eos_token = None - self._unk_token = None - self._sep_token = None - self._pad_token = None - self._cls_token = None - self._mask_token = None - self._additional_special_tokens = [] - - self.max_len = max_len if max_len is not None else int(1e12) - self.added_tokens_encoder = {} - self.added_tokens_decoder = {} - - for key, value in kwargs.items(): - if key in self.SPECIAL_TOKENS_ATTRIBUTES: - setattr(self, key, value) - - @classmethod - def from_pretrained(cls, *inputs, **kwargs): - return cls._from_pretrained(*inputs, **kwargs) - - @classmethod - def _from_pretrained(cls, - pretrained_model_name_or_path, - cache_dir=None, - *inputs, - **kwargs): - """ - Instantiate a PreTrainedTokenizer from pre-trained vocabulary files. - Download and cache the vocabulary files if needed. 
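A small sketch of the special-token plumbing above: names listed in SPECIAL_TOKENS_ATTRIBUTES are accepted as keyword arguments and surface as properties (the base class is normally subclassed, but this part can be exercised directly):

tok = PreTrainedTokenizer(max_len=512, unk_token='[UNK]', cls_token='[CLS]')
print(tok.unk_token)    # '[UNK]'
print(tok.bos_token)    # None (and logs an error, since it was never set)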
- """ - s3_models = list(cls.max_model_input_sizes.keys()) - vocab_files = {} - if pretrained_model_name_or_path in s3_models: - for file_id, map_list in cls.pretrained_vocab_files_map.items(): - vocab_files[file_id] = map_list[pretrained_model_name_or_path] - else: - logger.info( - "Model name '{}' not found in model shortcut name list ({}). " - "Assuming '{}' is a path or url to a directory containing tokenizer files." - .format(pretrained_model_name_or_path, ', '.join(s3_models), - pretrained_model_name_or_path)) - all_vocab_files_names = { - 'added_tokens_file': ADDED_TOKENS_FILE, - 'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE - } - all_vocab_files_names.update(cls.vocab_files_names) - for file_id, file_name in all_vocab_files_names.items(): - if os.path.isdir(pretrained_model_name_or_path): - full_file_name = os.path.join( - pretrained_model_name_or_path, file_name) - else: - full_file_name = pretrained_model_name_or_path - if not os.path.exists(full_file_name): - logger.info( - "Didn't find file {}. We won't load it.".format( - full_file_name)) - full_file_name = None - vocab_files[file_id] = full_file_name - if all(full_file_name is None - for full_file_name in vocab_files.values()): - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find tokenizer files" - "at this path or url.".format( - pretrained_model_name_or_path, - ', '.join(s3_models), - pretrained_model_name_or_path, - )) - return None - - # Get files from url, cache, or disk depending on the case - try: - resolved_vocab_files = {} - for file_id, file_path in vocab_files.items(): - if file_path is None: - resolved_vocab_files[file_id] = None - else: - resolved_vocab_files[file_id] = cached_path( - file_path, cache_dir=cache_dir) - except EnvironmentError: - if pretrained_model_name_or_path in s3_models: - logger.error("Couldn't reach server to download vocabulary.") - else: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find files {} " - "at this path or url.".format( - pretrained_model_name_or_path, - ', '.join(s3_models), pretrained_model_name_or_path, - str(vocab_files.keys()))) - return None - - for file_id, file_path in vocab_files.items(): - if file_path == resolved_vocab_files[file_id]: - logger.info("loading file {}".format(file_path)) - else: - logger.info("loading file {} from cache at {}".format( - file_path, resolved_vocab_files[file_id])) - - # Set max length if needed - if pretrained_model_name_or_path in cls.max_model_input_sizes: - # if we're using a pretrained model, ensure the tokenizer - # wont index sequences longer than the number of positional embeddings - max_len = cls.max_model_input_sizes[pretrained_model_name_or_path] - if max_len is not None and isinstance(max_len, (int, float)): - kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), - max_len) - - # Merge resolved_vocab_files arguments in kwargs. - added_tokens_file = resolved_vocab_files.pop('added_tokens_file', None) - special_tokens_map_file = resolved_vocab_files.pop( - 'special_tokens_map_file', None) - for args_name, file_path in resolved_vocab_files.items(): - if args_name not in kwargs: - kwargs[args_name] = file_path - if special_tokens_map_file is not None: - special_tokens_map = json.load( - open(special_tokens_map_file, encoding="utf-8")) - for key, value in special_tokens_map.items(): - if key not in kwargs: - kwargs[key] = value - - # Instantiate tokenizer. 
- tokenizer = cls(*inputs, **kwargs) - - # Add supplementary tokens. - if added_tokens_file is not None: - added_tok_encoder = json.load( - open(added_tokens_file, encoding="utf-8")) - added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} - tokenizer.added_tokens_encoder.update(added_tok_encoder) - tokenizer.added_tokens_decoder.update(added_tok_decoder) - - return tokenizer - - def save_pretrained(self, save_directory): - """ Save the tokenizer vocabulary files (with added tokens) and the - special-tokens-to-class-attributes-mapping to a directory, so that it - can be re-loaded using the `from_pretrained(save_directory)` class method. - """ - if not os.path.isdir(save_directory): - logger.error("Saving directory ({}) should be a directory".format( - save_directory)) - return - - special_tokens_map_file = os.path.join(save_directory, - SPECIAL_TOKENS_MAP_FILE) - added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE) - - with open(special_tokens_map_file, 'w', encoding='utf-8') as f: - f.write(json.dumps(self.special_tokens_map, ensure_ascii=False)) - - with open(added_tokens_file, 'w', encoding='utf-8') as f: - if self.added_tokens_encoder: - out_str = json.dumps(self.added_tokens_decoder, - ensure_ascii=False) - else: - out_str = u"{}" - f.write(out_str) - - vocab_files = self.save_vocabulary(save_directory) - - return vocab_files + (special_tokens_map_file, added_tokens_file) - - def save_vocabulary(self, save_directory): - """ Save the tokenizer vocabulary to a directory. This method doesn't save added tokens - and special token mappings. - - Please use `save_pretrained()` to save the full Tokenizer state so that it can be - reloaded using the `from_pretrained(save_directory)` class method. - """ - raise NotImplementedError - - def vocab_size(self): - raise NotImplementedError - - def __len__(self): - return self.vocab_size + len(self.added_tokens_encoder) - - def add_tokens(self, new_tokens): - """ Add a list of new tokens to the tokenizer class. If the new tokens are not in the - vocabulary, they are added to the added_tokens_encoder with indices starting from - the last index of the current vocabulary. - - Returns: - Number of tokens added to the vocabulary which can be used to correspondingly - increase the size of the associated model embedding matrices. - """ - if not new_tokens: - return 0 - - to_add_tokens = [] - for token in new_tokens: - if self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids( - self.unk_token): - to_add_tokens.append(token) - logger.info("Adding %s to the vocabulary", token) - - added_tok_encoder = dict( - (tok, len(self) + i) for i, tok in enumerate(to_add_tokens)) - added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} - self.added_tokens_encoder.update(added_tok_encoder) - self.added_tokens_decoder.update(added_tok_decoder) - - return len(to_add_tokens) - - def add_special_tokens(self, special_tokens_dict): - """ Add a dictionnary of special tokens (eos, pad, cls...) to the encoder and link them - to class attributes. If the special tokens are not in the vocabulary, they are added - to it and indexed starting from the last index of the current vocabulary. - - Returns: - Number of tokens added to the vocabulary which can be used to correspondingly - increase the size of the associated model embedding matrices. 
- """ - if not special_tokens_dict: - return 0 - - added_special_tokens = self.add_tokens(special_tokens_dict.values()) - for key, value in special_tokens_dict.items(): - logger.info("Assigning %s to the %s key of the tokenizer", value, - key) - setattr(self, key, value) - - return added_special_tokens - - def tokenize(self, text, **kwargs): - """ Converts a string in a sequence of tokens (string), using the tokenizer. - Split in words for word-based vocabulary or sub-words for sub-word-based - vocabularies (BPE/SentencePieces/WordPieces). - - Take care of added tokens. - """ - - def split_on_tokens(tok_list, text): - if not text: - return [] - if not tok_list: - return self._tokenize(text, **kwargs) - tok = tok_list[0] - split_text = text.split(tok) - return sum((split_on_tokens(tok_list[1:], sub_text.strip()) + [tok] \ - for sub_text in split_text), [])[:-1] - - added_tokens = list( - self.added_tokens_encoder.keys()) + self.all_special_tokens - tokenized_text = split_on_tokens(added_tokens, text) - return tokenized_text - - def tokenize_for_pos_tag(self, text, **kwargs): - """ Converts a string in a sequence of tokens (string), using the tokenizer. - Split in words for word-based vocabulary or sub-words for sub-word-based - vocabularies (BPE/SentencePieces/WordPieces). - - Take care of added tokens. - """ - - def split_on_tokens(tok_list, text): - if not text: - return ([], [], []) - if not tok_list: - # (split_tokens, basic_tokens, sub_to_token_idx_map) - return self._tokenize_for_pos_tag(text, **kwargs) - tok = tok_list[0] - split_text = text.split(tok) - split_tokens, basic_tokens, sub_to_token_idx_map = [], [], [] - for sub_text in split_text: - s_tok, b_tok, s2b_map = split_on_tokens( - tok_list[1:], sub_text.strip()) - sub_to_token_idx_map.extend( - [i + len(basic_tokens) - for i in s2b_map] + [len(s2b_map) + len(basic_tokens)]) - split_tokens.extend(s_tok + [tok]) - basic_tokens.extend(b_tok + [tok]) - split_tokens = split_tokens[:-1] - basic_tokens = basic_tokens[:-1] - sub_to_token_idx_map = sub_to_token_idx_map[:-1] - return (split_tokens, basic_tokens, sub_to_token_idx_map) - - added_tokens = list( - self.added_tokens_encoder.keys()) + self.all_special_tokens - tokenized_text = split_on_tokens(added_tokens, text) - return tokenized_text - - def _tokenize(self, text, **kwargs): - """ Converts a string in a sequence of tokens (string), using the tokenizer. - Split in words for word-based vocabulary or sub-words for sub-word-based - vocabularies (BPE/SentencePieces/WordPieces). - - Don't take care of added tokens. - """ - raise NotImplementedError - - def _tokenize_for_pos_tag(self, text, **kwargs): - """ Converts a string in a sequence of tokens (string), using the tokenizer. - Split in words for word-based vocabulary or sub-words for sub-word-based - vocabularies (BPE/SentencePieces/WordPieces). - - Don't take care of added tokens. - """ - raise NotImplementedError - - def convert_tokens_to_ids(self, tokens): - """ Converts a single token or a sequence of tokens (str/unicode) in a integer id - (resp.) a sequence of ids, using the vocabulary. - """ - if isinstance(tokens, str) or (six.PY2 - and isinstance(tokens, unicode)): - return self._convert_token_to_id_with_added_voc(tokens) - - ids = [] - for token in tokens: - ids.append(self._convert_token_to_id_with_added_voc(token)) - if len(ids) > self.max_len: - logger.warning( - "Token indices sequence length is longer than the specified maximum sequence length " - "for this model ({} > {}). 
Running this sequence through the model will result in " - "indexing errors".format(len(ids), self.max_len)) - return ids - - def _convert_token_to_id_with_added_voc(self, token): - if token in self.added_tokens_encoder: - return self.added_tokens_encoder[token] - return self._convert_token_to_id(token) - - def _convert_token_to_id(self, token): - raise NotImplementedError - - def encode(self, text): - """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary. - same as self.convert_tokens_to_ids(self.tokenize(text)). - """ - return self.convert_tokens_to_ids(self.tokenize(text)) - - def convert_ids_to_tokens(self, ids, skip_special_tokens=False): - """ Converts a single index or a sequence of indices (integers) in a token " - (resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens. - - Args: - skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False - """ - if isinstance(ids, int): - if ids in self.added_tokens_decoder: - return self.added_tokens_decoder[ids] - else: - return self._convert_id_to_token(ids) - tokens = [] - for index in ids: - if index in self.all_special_ids and skip_special_tokens: - continue - if index in self.added_tokens_decoder: - tokens.append(self.added_tokens_decoder[index]) - else: - tokens.append(self._convert_id_to_token(index)) - return tokens - - def _convert_id_to_token(self, index): - raise NotImplementedError - - def convert_tokens_to_string(self, tokens): - """ Converts a sequence of tokens (string) in a single string. - The most simple way to do it is ' '.join(self.convert_ids_to_tokens(token_ids)) - but we often want to remove sub-word tokenization artifacts at the same time. - """ - return ' '.join(self.convert_ids_to_tokens(tokens)) - - def decode(self, - token_ids, - skip_special_tokens=False, - clean_up_tokenization_spaces=True): - """ Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary - with options to remove special tokens and clean up tokenization spaces. - """ - filtered_tokens = self.convert_ids_to_tokens( - token_ids, skip_special_tokens=skip_special_tokens) - text = self.convert_tokens_to_string(filtered_tokens) - if clean_up_tokenization_spaces: - text = clean_up_tokenization(text) - return text - - @property - def special_tokens_map(self): - """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their - values ('', ''...) - """ - set_attr = {} - for attr in self.SPECIAL_TOKENS_ATTRIBUTES: - attr_value = getattr(self, "_" + attr) - if attr_value: - set_attr[attr] = attr_value - return set_attr - - @property - def all_special_tokens(self): - """ List all the special tokens ('', ''...) mapped to class attributes - (cls_token, unk_token...). - """ - all_toks = [] - set_attr = self.special_tokens_map - for attr_value in set_attr.values(): - all_toks = all_toks + (attr_value if isinstance( - attr_value, (list, tuple)) else [attr_value]) - all_toks = list(set(all_toks)) - return all_toks - - @property - def all_special_ids(self): - """ List the vocabulary indices of the special tokens ('', ''...) mapped to - class attributes (cls_token, unk_token...). 
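Editorial sketch: to make the decode path above concrete (ids -> tokens -> string -> cleanup), here is a minimal standalone mirror of it; the toy id table and the single cleanup rule are illustrative stand-ins for `convert_ids_to_tokens`, `convert_tokens_to_string`, and `clean_up_tokenization`:

# Toy mirror of decode(): ids -> tokens -> string -> cleanup.
id_to_token = {0: "[CLS]", 1: "[SEP]", 2: "it", 3: "is", 4: "n't", 5: "here"}
special_ids = {0, 1}

def decode(ids, skip_special_tokens=True):
    tokens = [id_to_token[i] for i in ids
              if not (skip_special_tokens and i in special_ids)]
    text = " ".join(tokens)             # convert_tokens_to_string
    return text.replace(" n't", "n't")  # one rule from clean_up_tokenization

assert decode([0, 2, 3, 4, 5, 1]) == "it isn't here"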
- """ - all_toks = self.all_special_tokens - all_ids = list(self.convert_tokens_to_ids(t) for t in all_toks) - return all_ids - - -def clean_up_tokenization(out_string): - out_string = out_string.replace(' .', '.').replace(' ?', '?').replace( - ' !', '!').replace(' ,', ',').replace(" ' ", "'").replace( - " n't", - "n't").replace(" 'm", "'m").replace(" do not", " don't").replace( - " 's", "'s").replace(" 've", "'ve").replace(" 're", "'re") - return out_string diff --git a/AVLFormer/src/modeling/load_bert.py b/AVLFormer/src/modeling/load_bert.py deleted file mode 100644 index 20156a8..0000000 --- a/AVLFormer/src/modeling/load_bert.py +++ /dev/null @@ -1,64 +0,0 @@ -from src.layers.bert import BertConfig, BertForImageCaptioning, BertTokenizer -from src.utils.logger import LOGGER as logger - - -def get_bert_model(args): - # Load pretrained bert and tokenizer based on training configs - config_class, model_class, tokenizer_class = BertConfig, BertForImageCaptioning, BertTokenizer - config = config_class.from_pretrained(args.config_name if args.config_name else \ - args.model_name_or_path, num_labels=2, finetuning_task='image_captioning') - - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name \ - else args.model_name_or_path, do_lower_case=args.do_lower_case) - config.img_feature_type = 'frcnn' - config.hidden_dropout_prob = args.drop_out - config.loss_type = 'classification' - config.tie_weights = args.tie_weights - config.freeze_embedding = args.freeze_embedding - config.label_smoothing = args.label_smoothing - config.drop_worst_ratio = args.drop_worst_ratio - config.drop_worst_after = args.drop_worst_after - config.lambda_ = args.lambda_ - print('Lambda ratio: ', config.lambda_) - - # update model structure if specified in arguments - update_params = [ - 'img_feature_dim', 'num_hidden_layers', 'hidden_size', - 'num_attention_heads', 'intermediate_size' - ] - model_structure_changed = [False] * len(update_params) - # model_structure_changed[0] = True # cclin hack - for idx, param in enumerate(update_params): - arg_param = getattr(args, param) - # bert-base-uncased do not have img_feature_dim - config_param = getattr(config, param) if hasattr(config, param) else -1 - if arg_param > 0 and arg_param != config_param: - logger.info( - f"Update config parameter {param}: {config_param} -> {arg_param}" - ) - setattr(config, param, arg_param) - model_structure_changed[idx] = True - if any(model_structure_changed): - assert config.hidden_size % config.num_attention_heads == 0 - if args.load_partial_weights: - # can load partial weights when changing layer only. 
- assert not any(model_structure_changed[2:]), "Cannot load partial weights " \ - "when any of ({}) is changed.".format(', '.join(update_params[2:])) - model = model_class.from_pretrained( - args.model_name_or_path, - from_tf=bool('.ckpt' in args.model_name_or_path), - config=config) - logger.info("Load partial weights for bert layers.") - else: - model = model_class(config=config) # init from scratch - logger.info("Init model from scratch.") - else: - model = model_class.from_pretrained( - args.model_name_or_path, - from_tf=bool('.ckpt' in args.model_name_or_path), - config=config) - logger.info(f"Load pretrained model: {args.model_name_or_path}") - - total_params = sum(p.numel() for p in model.parameters()) - logger.info(f'Model total parameters: {total_params}') - return model, config, tokenizer \ No newline at end of file diff --git a/AVLFormer/src/modeling/load_passt.py b/AVLFormer/src/modeling/load_passt.py deleted file mode 100644 index 48d765e..0000000 --- a/AVLFormer/src/modeling/load_passt.py +++ /dev/null @@ -1,56 +0,0 @@ -from src.modeling.passt.passt import get_model -from src.modeling.passt.preprocess import AugmentMelSTFT -import torch.nn as nn - - -class MyPasst(nn.Module): - - def __init__(self) -> None: - super().__init__() - self.mel = AugmentMelSTFT(n_mels=128, - sr=32000, - win_length=800, - hopsize=320, - n_fft=1024, - freqm=48, - timem=192, - htk=False, - fmin=0.0, - fmax=None, - norm=1, - fmin_aug_range=10, - fmax_aug_range=2000) - self.net = get_model(arch="passt_s_swa_p16_128_ap476", - pretrained=True, - n_classes=0, - in_channels=1, - fstride=10, - tstride=10, - input_fdim=128, - input_tdim=998, - u_patchout=0, - s_patchout_t=40, - s_patchout_f=4) - self.dyn_norm = False - self.linear = nn.Linear(in_features=768, out_features=512) - - def mel_forward(self, x): - old_shape = x.size() - x = x.reshape(-1, old_shape[2]) - x = self.mel(x) - x = x.reshape(old_shape[0], old_shape[1], x.shape[1], x.shape[2]) - - return x - - def forward(self, x): - x = self.mel_forward(x) - embed = self.net(x) - # feature downsample - embed = self.linear(embed) - return embed - - def freeze(self): - for _, p in self.mel.named_parameters(): - p.requires_grad = False - for _, p in self.net.named_parameters(): - p.requires_grad = False diff --git a/AVLFormer/src/modeling/load_swin.py b/AVLFormer/src/modeling/load_swin.py deleted file mode 100644 index 08be937..0000000 --- a/AVLFormer/src/modeling/load_swin.py +++ /dev/null @@ -1,90 +0,0 @@ -from src.modeling.video_swin.config import Config -from src.modeling.video_swin.swin_transformer import SwinTransformer3D -from src.utils.logger import LOGGER as logger -import torch - - -def get_swin_model(args): - if int(args.img_res) == 384: - assert args.vidswin_size == "large" - config_path = 'src/modeling/video_swin/swin_%s_384_patch244_window81212_kinetics%s_22k.py' % ( - args.vidswin_size, args.kinetics) - model_path = './models/video_swin_transformer/swin_%s_384_patch244_window81212_kinetics%s_22k.pth' % ( - args.vidswin_size, args.kinetics) - else: - # in the case that args.img_res == '224' - config_path = 'src/modeling/video_swin/swin_%s_patch244_window877_kinetics%s_22k.py' % ( - args.vidswin_size, args.kinetics) - model_path = './models/video_swin_transformer/swin_%s_patch244_window877_kinetics%s_22k.pth' % ( - args.vidswin_size, args.kinetics) - if args.pretrained_2d: - config_path = 'src/modeling/video_swin/swin_base_patch244_window877_kinetics400_22k.py' - model_path = './models/swin_transformer/swin_base_patch4_window7_224_22k.pth' - - 
logger.info(f'video swin (config path): {config_path}')
-    if args.pretrained_checkpoint == '':
-        logger.info(f'video swin (model path): {model_path}')
-    cfg = Config.fromfile(config_path)
-    pretrained_path = model_path if args.pretrained_2d else None
-    backbone = SwinTransformer3D(
-        pretrained=pretrained_path,
-        pretrained2d=args.pretrained_2d,
-        patch_size=cfg.model['backbone']['patch_size'],
-        in_chans=3,
-        embed_dim=cfg.model['backbone']['embed_dim'],
-        depths=cfg.model['backbone']['depths'],
-        num_heads=cfg.model['backbone']['num_heads'],
-        window_size=cfg.model['backbone']['window_size'],
-        mlp_ratio=4.,
-        qkv_bias=True,
-        qk_scale=None,
-        drop_rate=0.,
-        attn_drop_rate=0.,
-        drop_path_rate=0.2,
-        norm_layer=torch.nn.LayerNorm,
-        patch_norm=cfg.model['backbone']['patch_norm'],
-        frozen_stages=-1,
-        use_checkpoint=False)
-
-    video_swin = myVideoSwin(args=args, cfg=cfg, backbone=backbone)
-
-    if not args.pretrained_2d:
-        checkpoint_3d = torch.load(model_path, map_location='cpu')
-        video_swin.load_state_dict(checkpoint_3d['state_dict'], strict=False)
-    else:
-        video_swin.backbone.init_weights()
-    return video_swin
-
-
-def reload_pretrained_swin(video_swin, args):
-    if not args.reload_pretrained_swin:
-        return video_swin
-    if int(args.img_res) == 384:
-        model_path = './models/video_swin_transformer/swin_%s_384_patch244_window81212_kinetics%s_22k.pth' % (
-            args.vidswin_size, args.kinetics)
-    else:
-        # in the case that args.img_res == '224'
-        model_path = './models/video_swin_transformer/swin_%s_patch244_window877_kinetics%s_22k.pth' % (
-            args.vidswin_size, args.kinetics)
-
-    checkpoint_3d = torch.load(model_path, map_location='cpu')
-    missing, unexpected = video_swin.load_state_dict(
-        checkpoint_3d['state_dict'], strict=False)
-    logger.info(f"re-loaded video_swin_transformer from {model_path}")
-
-    logger.info(f"Missing keys in loaded video_swin_transformer: {missing}")
-    logger.info(
-        f"Unexpected keys in loaded video_swin_transformer: {unexpected}")
-    return video_swin
-
-
-class myVideoSwin(torch.nn.Module):
-
-    def __init__(self, args, cfg, backbone):
-        super(myVideoSwin, self).__init__()
-        self.backbone = backbone
-        self.use_grid_feature = args.grid_feat
-
-    def forward(self, x):
-        x = self.backbone(x)
-        return x
diff --git a/AVLFormer/src/modeling/passt/helpers/vit_helpers.py b/AVLFormer/src/modeling/passt/helpers/vit_helpers.py
deleted file mode 100644
index 11bef9a..0000000
--- a/AVLFormer/src/modeling/passt/helpers/vit_helpers.py
+++ /dev/null
@@ -1,263 +0,0 @@
-"""
-Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
-
-"""
-from copy import deepcopy
-import math
-import warnings
-
-from timm.models.helpers import load_pretrained
-import torch
-from torch import nn
-
-
-def overlay_external_default_cfg(default_cfg, kwargs):
-    """ Overlay 'external_default_cfg' in kwargs on top of default_cfg arg.
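Editorial sketch: as a rough usage picture for `get_swin_model`/`myVideoSwin` above, the wrapper simply forwards a `(batch, channels, time, height, width)` clip through the 3D Swin backbone. The shapes below assume the base 224-res Kinetics config (patch size (2, 4, 4), embed_dim 128) and are an estimate, not taken from this file:

import torch

# video_swin = get_swin_model(args)   # args as parsed by the training script
frames = torch.randn(1, 3, 32, 224, 224)  # (batch, channels, time, H, W)
# feats = video_swin(frames)
# Expected roughly (1, 1024, 16, 7, 7): channels = embed_dim * 2**3,
# time halved by the temporal patch stride, H and W reduced 32x overall.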
- """ - external_default_cfg = kwargs.pop('external_default_cfg', None) - if external_default_cfg: - default_cfg.pop('url', None) # url should come from external cfg - default_cfg.pop('hf_hub', - None) # hf hub id should come from external cfg - default_cfg.update(external_default_cfg) - - -def filter_kwargs(kwargs, names): - if not kwargs or not names: - return - for n in names: - kwargs.pop(n, None) - - -def set_default_kwargs(kwargs, names, default_cfg): - for n in names: - # for legacy reasons, model __init__args uses img_size + in_chans as separate args while - # default_cfg has one input_size=(C, H ,W) entry - if n == 'img_size': - input_size = default_cfg.get('input_size', None) - if input_size is not None: - assert len(input_size) == 3 - kwargs.setdefault(n, input_size[-2:]) - elif n == 'in_chans': - input_size = default_cfg.get('input_size', None) - if input_size is not None: - assert len(input_size) == 3 - kwargs.setdefault(n, input_size[0]) - else: - default_val = default_cfg.get(n, None) - if default_val is not None: - kwargs.setdefault(n, default_cfg[n]) - - -def update_default_cfg_and_kwargs(default_cfg, kwargs, kwargs_filter): - """ Update the default_cfg and kwargs before passing to model - - FIXME this sequence of overlay default_cfg, set default kwargs, filter kwargs - could/should be replaced by an improved configuration mechanism - - Args: - default_cfg: input default_cfg (updated in-place) - kwargs: keyword args passed to model build fn (updated in-place) - kwargs_filter: keyword arg keys that must be removed before model __init__ - """ - # Overlay default cfg values from `external_default_cfg` if it exists in kwargs - overlay_external_default_cfg(default_cfg, kwargs) - # Set model __init__ args that can be determined by default_cfg (if not already passed as kwargs) - default_kwarg_names = ('num_classes', 'global_pool', 'in_chans') - if default_cfg.get('fixed_input_size', False): - # if fixed_input_size exists and is True, model takes an img_size arg that fixes its input size - default_kwarg_names += ('img_size', ) - set_default_kwargs(kwargs, - names=default_kwarg_names, - default_cfg=default_cfg) - # Filter keyword args for task specific model variants (some 'features only' models, etc.) - filter_kwargs(kwargs, names=kwargs_filter) - - -def drop_path(x, drop_prob: float = 0., training: bool = False): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - - This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, - the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... - See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for - changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use - 'survival rate' as the argument. - - """ - if drop_prob == 0. or not training: - return x - keep_prob = 1 - drop_prob - shape = (x.shape[0], ) + (1, ) * ( - x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets - random_tensor = keep_prob + torch.rand( - shape, dtype=x.dtype, device=x.device) - random_tensor.floor_() # binarize - output = x.div(keep_prob) * random_tensor - return output - - -class DropPath(nn.Module): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
- """ - - def __init__(self, drop_prob=None): - super(DropPath, self).__init__() - self.drop_prob = drop_prob - - def forward(self, x): - return drop_path(x, self.drop_prob, self.training) - - -from torch.nn.init import _calculate_fan_in_and_fan_out - - -def _no_grad_trunc_normal_(tensor, mean, std, a, b): - # Cut & paste from PyTorch official master until it's in a few official releases - RW - # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf - def norm_cdf(x): - # Computes standard normal cumulative distribution function - return (1. + math.erf(x / math.sqrt(2.))) / 2. - - if (mean < a - 2 * std) or (mean > b + 2 * std): - warnings.warn( - "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " - "The distribution of values may be incorrect.", - stacklevel=2) - - with torch.no_grad(): - # Values are generated by using a truncated uniform distribution and - # then using the inverse CDF for the normal distribution. - # Get upper and lower cdf values - l = norm_cdf((a - mean) / std) - u = norm_cdf((b - mean) / std) - - # Uniformly fill tensor with values from [l, u], then translate to - # [2l-1, 2u-1]. - tensor.uniform_(2 * l - 1, 2 * u - 1) - - # Use inverse cdf transform for normal distribution to get truncated - # standard normal - tensor.erfinv_() - - # Transform to proper mean, std - tensor.mul_(std * math.sqrt(2.)) - tensor.add_(mean) - - # Clamp to ensure it's in the proper range - tensor.clamp_(min=a, max=b) - return tensor - - -def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): - # type: (Tensor, float, float, float, float) -> Tensor - r"""Fills the input Tensor with values drawn from a truncated - normal distribution. The values are effectively drawn from the - normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` - with values outside :math:`[a, b]` redrawn until they are within - the bounds. The method used for generating the random values works - best when :math:`a \leq \text{mean} \leq b`. 
- Args: - tensor: an n-dimensional `torch.Tensor` - mean: the mean of the normal distribution - std: the standard deviation of the normal distribution - a: the minimum cutoff value - b: the maximum cutoff value - Examples: - >>> w = torch.empty(3, 5) - >>> nn.init.trunc_normal_(w) - """ - return _no_grad_trunc_normal_(tensor, mean, std, a, b) - - -def variance_scaling_(tensor, scale=1.0, mode='fan_in', distribution='normal'): - fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) - if mode == 'fan_in': - denom = fan_in - elif mode == 'fan_out': - denom = fan_out - elif mode == 'fan_avg': - denom = (fan_in + fan_out) / 2 - - variance = scale / denom - - if distribution == "truncated_normal": - # constant is stddev of standard normal truncated to (-2, 2) - trunc_normal_(tensor, std=math.sqrt(variance) / .87962566103423978) - elif distribution == "normal": - tensor.normal_(std=math.sqrt(variance)) - elif distribution == "uniform": - bound = math.sqrt(3 * variance) - tensor.uniform_(-bound, bound) - else: - raise ValueError(f"invalid distribution {distribution}") - - -def lecun_normal_(tensor): - variance_scaling_(tensor, mode='fan_in', distribution='truncated_normal') - - -def build_model_with_cfg(model_cls, - variant: str, - pretrained: bool, - default_cfg: dict, - model_cfg=None, - feature_cfg=None, - pretrained_strict: bool = True, - pretrained_filter_fn=None, - pretrained_custom_load=False, - kwargs_filter=None, - **kwargs): - """ Build model with specified default_cfg and optional model_cfg - - This helper fn aids in the construction of a model including: - * handling default_cfg and associated pretained weight loading - * passing through optional model_cfg for models with config based arch spec - * features_only model adaptation - * pruning config / model adaptation - - Args: - model_cls (nn.Module): model class - variant (str): model variant name - pretrained (bool): load pretrained weights - default_cfg (dict): model's default pretrained/task config - model_cfg (Optional[Dict]): model's architecture config - feature_cfg (Optional[Dict]: feature extraction adapter config - pretrained_strict (bool): load pretrained weights strictly - pretrained_filter_fn (Optional[Callable]): filter callable for pretrained weights - pretrained_custom_load (bool): use custom load fn, to load numpy or other non PyTorch weights - kwargs_filter (Optional[Tuple]): kwargs to filter before passing to model - **kwargs: model args passed through to model __init__ - """ - pruned = kwargs.pop('pruned', False) - features = False - feature_cfg = feature_cfg or {} - default_cfg = deepcopy(default_cfg) if default_cfg else {} - update_default_cfg_and_kwargs(default_cfg, kwargs, kwargs_filter) - default_cfg.setdefault('architecture', variant) - - # Setup for feature extraction wrapper done at end of this fn - if kwargs.pop('features_only', False): - features = True - feature_cfg.setdefault('out_indices', (0, 1, 2, 3, 4)) - if 'out_indices' in kwargs: - feature_cfg['out_indices'] = kwargs.pop('out_indices') - - # Build the model - model = model_cls( - **kwargs) if model_cfg is None else model_cls(cfg=model_cfg, **kwargs) - model.pretrained_cfg = default_cfg - model.default_cfg = default_cfg - - # For classification models, check class attr, then kwargs, then default to 1k, otherwise 0 for feats - num_classes_pretrained = 0 if features else getattr( - model, 'num_classes', kwargs.get('num_classes', 1000)) - if pretrained: - if pretrained_custom_load: - load_custom_pretrained(model) - else: - load_pretrained(model, - 
num_classes=num_classes_pretrained, - in_chans=kwargs.get('in_chans', 3), - filter_fn=pretrained_filter_fn, - strict=pretrained_strict) - return model diff --git a/AVLFormer/src/modeling/passt/passt.py b/AVLFormer/src/modeling/passt/passt.py deleted file mode 100644 index f76fa11..0000000 --- a/AVLFormer/src/modeling/passt/passt.py +++ /dev/null @@ -1,1437 +0,0 @@ -""" -Most of this code comes from the timm library. -We tried to disentangle from the timm library version. - -Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py - -""" -from collections import OrderedDict -from copy import deepcopy -from functools import partial -import logging -import math -import warnings - -from pyexpat import features -from timm.models.layers.helpers import to_2tuple -import torch -import torch.nn as nn -import torch.nn.functional as F - -from .helpers.vit_helpers import ( - DropPath, - build_model_with_cfg, - trunc_normal_, - update_default_cfg_and_kwargs, -) - -_logger = logging.getLogger() - -IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) -IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) -IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5) -IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5) - - -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, - 'input_size': (3, 224, 224), - 'pool_size': None, - 'crop_pct': .9, - 'interpolation': 'bicubic', - 'fixed_input_size': True, - 'mean': IMAGENET_INCEPTION_MEAN, - 'std': IMAGENET_INCEPTION_STD, - 'first_conv': 'patch_embed.proj', - 'classifier': 'head', - **kwargs - } - - -default_cfgs = { - # patch models (weights from official Google JAX impl) - 'vit_tiny_patch16_224': - _cfg( - url='https://storage.googleapis.com/vit_models/augreg/' - 'Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz' - ), - 'vit_tiny_patch16_384': - _cfg( - url='https://storage.googleapis.com/vit_models/augreg/' - 'Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz', - input_size=(3, 384, 384), - crop_pct=1.0), - 'vit_small_patch32_224': - _cfg( - url='https://storage.googleapis.com/vit_models/augreg/' - 'S_32-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz' - ), - 'vit_small_patch32_384': - _cfg( - url='https://storage.googleapis.com/vit_models/augreg/' - 'S_32-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz', - input_size=(3, 384, 384), - crop_pct=1.0), - 'vit_small_patch16_224': - _cfg( - url='https://storage.googleapis.com/vit_models/augreg/' - 'S_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz' - ), - 'vit_small_patch16_384': - _cfg( - url='https://storage.googleapis.com/vit_models/augreg/' - 'S_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz', - input_size=(3, 384, 384), - crop_pct=1.0), - 'vit_base_patch32_224': - _cfg( - url='https://storage.googleapis.com/vit_models/augreg/' - 'B_32-i21k-300ep-lr_0.001-aug_medium1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz' - ), - 'vit_base_patch32_384': - _cfg( - url='https://storage.googleapis.com/vit_models/augreg/' - 'B_32-i21k-300ep-lr_0.001-aug_light1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz', - input_size=(3, 384, 384), - crop_pct=1.0), - 'vit_base_patch16_224': - _cfg( - url='https://storage.googleapis.com/vit_models/augreg/' - 
'B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_224.npz' - ), - 'vit_base_patch16_384': - _cfg( - url='https://storage.googleapis.com/vit_models/augreg/' - 'B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz', - input_size=(3, 384, 384), - crop_pct=1.0), - 'vit_large_patch32_224': - _cfg( - url='', # no official model weights for this combo, only for in21k - ), - 'vit_large_patch32_384': - _cfg( - url= - 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p32_384-9b920ba8.pth', - input_size=(3, 384, 384), - crop_pct=1.0), - 'vit_large_patch16_224': - _cfg( - url='https://storage.googleapis.com/vit_models/augreg/' - 'L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_224.npz' - ), - 'vit_large_patch16_384': - _cfg( - url='https://storage.googleapis.com/vit_models/augreg/' - 'L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_384.npz', - input_size=(3, 384, 384), - crop_pct=1.0), - - # patch models, imagenet21k (weights from official Google JAX impl) - 'vit_tiny_patch16_224_in21k': - _cfg( - url= - 'https://storage.googleapis.com/vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npz', - num_classes=21843), - 'vit_small_patch32_224_in21k': - _cfg( - url= - 'https://storage.googleapis.com/vit_models/augreg/S_32-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0.npz', - num_classes=21843), - 'vit_small_patch16_224_in21k': - _cfg( - url= - 'https://storage.googleapis.com/vit_models/augreg/S_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0.npz', - num_classes=21843), - 'vit_base_patch32_224_in21k': - _cfg( - url= - 'https://storage.googleapis.com/vit_models/augreg/B_32-i21k-300ep-lr_0.001-aug_medium1-wd_0.03-do_0.0-sd_0.0.npz', - num_classes=21843), - 'vit_base_patch16_224_in21k': - _cfg( - url= - 'https://storage.googleapis.com/vit_models/augreg/B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0.npz', - num_classes=21843), - 'vit_large_patch32_224_in21k': - _cfg( - url= - 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch32_224_in21k-9046d2e7.pth', - num_classes=21843), - 'vit_large_patch16_224_in21k': - _cfg( - url= - 'https://storage.googleapis.com/vit_models/augreg/L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1.npz', - num_classes=21843), - 'vit_huge_patch14_224_in21k': - _cfg(url= - 'https://storage.googleapis.com/vit_models/imagenet21k/ViT-H_14.npz', - hf_hub='timm/vit_huge_patch14_224_in21k', - num_classes=21843), - - # SAM trained models (https://arxiv.org/abs/2106.01548) - 'vit_base_patch32_sam_224': - _cfg(url='https://storage.googleapis.com/vit_models/sam/ViT-B_32.npz'), - 'vit_base_patch16_sam_224': - _cfg(url='https://storage.googleapis.com/vit_models/sam/ViT-B_16.npz'), - - # deit models (FB weights) - 'deit_tiny_patch16_224': - _cfg( - url= - 'https://dl.fbaipublicfiles.com/deit/deit_tiny_patch16_224-a1311bcf.pth', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD), - 'deit_small_patch16_224': - _cfg( - url= - 'https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD), - 'deit_base_patch16_224': - _cfg( - url= - 'https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD), - 'deit_base_patch16_384': - _cfg( - url= 
- 'https://dl.fbaipublicfiles.com/deit/deit_base_patch16_384-8de9b5d1.pth', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - input_size=(3, 384, 384), - crop_pct=1.0), - 'deit_tiny_distilled_patch16_224': - _cfg( - url= - 'https://dl.fbaipublicfiles.com/deit/deit_tiny_distilled_patch16_224-b40b3cf7.pth', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - classifier=('head', 'head_dist')), - 'deit_small_distilled_patch16_224': - _cfg( - url= - 'https://dl.fbaipublicfiles.com/deit/deit_small_distilled_patch16_224-649709d9.pth', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - classifier=('head', 'head_dist')), - 'deit_base_distilled_patch16_224': - _cfg( - url= - 'https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_224-df68dfff.pth', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - classifier=('head', 'head_dist')), - 'deit_base_distilled_patch16_384': - _cfg( - url= - 'https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_384-d0272ac0.pth', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - input_size=(3, 384, 384), - crop_pct=1.0, - classifier=('head', 'head_dist')), - - # ViT ImageNet-21K-P pretraining by MILL - 'vit_base_patch16_224_miil_in21k': - _cfg( - url= - 'https://miil-public-eu.oss-eu-central-1.aliyuncs.com/model-zoo/ImageNet_21K_P/models/timm/vit_base_patch16_224_in21k_miil.pth', - mean=(0, 0, 0), - std=(1, 1, 1), - crop_pct=0.875, - interpolation='bilinear', - num_classes=11221, - ), - 'vit_base_patch16_224_miil': - _cfg( - url= - 'https://miil-public-eu.oss-eu-central-1.aliyuncs.com/model-zoo/ImageNet_21K_P/models/timm' - '/vit_base_patch16_224_1k_miil_84_4.pth', - mean=(0, 0, 0), - std=(1, 1, 1), - crop_pct=0.875, - interpolation='bilinear', - ), - # PaSST - 'passt_s_swa_p16_128_ap476': - _cfg( - url= - 'https://github.com/kkoutini/PaSST/releases/download/v0.0.1-audioset/passt-s-f128-p16-s10-ap.476-swa.pt', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - input_size=(1, 128, 998), - crop_pct=1.0, - classifier=('head.1', 'head_dist'), - num_classes=527), - 'passt_s_swa_p16_128_ap4761': - _cfg( - url= - 'https://github.com/kkoutini/PaSST/releases/download/v0.0.2-audioset/passt-s-f128-p16-s10-ap.4761-swa.pt', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - input_size=(1, 128, 998), - crop_pct=1.0, - classifier=('head.1', 'head_dist'), - num_classes=527), - 'passt_s_p16_128_ap472': - _cfg( - url= - 'https://github.com/kkoutini/PaSST/releases/download/v0.0.2-audioset/passt-s-f128-p16-s10-ap.472.pt', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - input_size=(1, 128, 998), - crop_pct=1.0, - classifier=('head.1', 'head_dist'), - num_classes=527), - 'passt_s_p16_s16_128_ap468': - _cfg( - url= - 'https://github.com/kkoutini/PaSST/releases/download/v0.0.2-audioset/passt-s-f128-p16-s16-ap.468.pt', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - input_size=(1, 128, 998), - crop_pct=1.0, - classifier=('head.1', 'head_dist'), - num_classes=527), - 'passt_s_swa_p16_s16_128_ap473': - _cfg( - url= - 'https://github.com/kkoutini/PaSST/releases/download/v0.0.2-audioset/passt-s-f128-p16-s16-ap.473-swa.pt', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - input_size=(1, 128, 998), - crop_pct=1.0, - classifier=('head.1', 'head_dist'), - num_classes=527), - 'passt_s_swa_p16_s14_128_ap471': - _cfg( - url= - 'https://github.com/kkoutini/PaSST/releases/download/v0.0.2-audioset/passt-s-f128-p16-s14-ap.471-swa.pt', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - 
input_size=(1, 128, 998), - crop_pct=1.0, - classifier=('head.1', 'head_dist'), - num_classes=527), - 'passt_s_p16_s14_128_ap469': - _cfg( - url= - 'https://github.com/kkoutini/PaSST/releases/download/v0.0.2-audioset/passt-s-f128-p16-s14-ap.469.pt', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - input_size=(1, 128, 998), - crop_pct=1.0, - classifier=('head.1', 'head_dist'), - num_classes=527), - 'passt_s_swa_p16_s12_128_ap473': - _cfg( - url= - 'https://github.com/kkoutini/PaSST/releases/download/v0.0.2-audioset/passt-s-f128-p16-s12-ap.473-swa.pt', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - input_size=(1, 128, 998), - crop_pct=1.0, - classifier=('head.1', 'head_dist'), - num_classes=527), - 'passt_s_p16_s12_128_ap470': - _cfg( - url= - 'https://github.com/kkoutini/PaSST/releases/download/v0.0.2-audioset/passt-s-f128-p16-s12-ap.470.pt', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - input_size=(1, 128, 998), - crop_pct=1.0, - classifier=('head.1', 'head_dist'), - num_classes=527), - 'passt_s_swa_f128_stfthop100_p16_s10_ap473': - _cfg( - url= - 'https://github.com/kkoutini/PaSST/releases/download/v0.0.3-audioset/passt-s-f128-stfthop100-p16-s10-ap.473-swa.pt', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - input_size=(1, 128, 3200), - crop_pct=1.0, - classifier=('head.1', 'head_dist'), - num_classes=527), - 'passt_s_swa_f128_stfthop160_p16_s10_ap473': - _cfg( - url= - 'https://github.com/kkoutini/PaSST/releases/download/v0.0.3-audioset/passt-s-f128-stfthop160-p16-s10-ap.473-swa.pt', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - input_size=(1, 128, 2000), - crop_pct=1.0, - classifier=('head.1', 'head_dist'), - num_classes=527), - 'passt-s-f128-20sec-p16-s10-ap474-swa': - _cfg( - url= - 'https://github.com/kkoutini/PaSST/releases/download/v0.0.5/passt-s-f128-20sec-p16-s10-ap.474-swa.pt', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - input_size=(1, 128, 2000), - crop_pct=1.0, - classifier=('head.1', 'head_dist'), - num_classes=527), - 'passt-s-f128-30sec-p16-s10-ap473-swa': - _cfg( - url= - 'https://github.com/kkoutini/PaSST/releases/download/v0.0.5/passt-s-f128-30sec-p16-s10-ap.473-swa.pt', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - input_size=(1, 128, 3000), - crop_pct=1.0, - classifier=('head.1', 'head_dist'), - num_classes=527), - 'openmic2008_passt_u_f128_p16_s10_ap85_swa': - _cfg( - url= - 'https://github.com/kkoutini/PaSST/releases/download/v0.0.4-openmic/openmic2008.passt-u-f128-p16-s10-ap.85-swa.pt', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - input_size=(1, 128, 3200), - crop_pct=1.0, - classifier=('head.1', 'head_dist'), - num_classes=20), - 'openmic2008_passt_u_f128_p16_s10_ap85 ': - _cfg( - url= - 'https://github.com/kkoutini/PaSST/releases/download/v0.0.4-openmic/openmic2008.passt-u-f128-p16-s10-ap.85.pt', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - input_size=(1, 128, 2000), - crop_pct=1.0, - classifier=('head.1', 'head_dist'), - num_classes=20), -} - - -def adapt_input_conv(in_chans, conv_weight): - conv_type = conv_weight.dtype - conv_weight = conv_weight.float( - ) # Some weights are in torch.half, ensure it's float for sum on CPU - O, I, J, K = conv_weight.shape - if in_chans == 1: - if I > 3: - assert conv_weight.shape[1] % 3 == 0 - # For models with space2depth stems - conv_weight = conv_weight.reshape(O, I // 3, 3, J, K) - conv_weight = conv_weight.sum(dim=2, keepdim=False) - else: - conv_weight = conv_weight.sum(dim=1, keepdim=True) - elif 
in_chans != 3: - if I != 3: - raise NotImplementedError( - 'Weight format not supported by conversion.') - else: - # NOTE this strategy should be better than random init, but there could be other combinations of - # the original RGB input layer weights that'd work better for specific cases. - repeat = int(math.ceil(in_chans / 3)) - conv_weight = conv_weight.repeat(1, repeat, 1, - 1)[:, :in_chans, :, :] - conv_weight *= (3 / float(in_chans)) - conv_weight = conv_weight.to(conv_type) - return conv_weight - - -class Mlp(nn.Module): - """ MLP as used in Vision Transformer, MLP-Mixer and related networks - """ - - def __init__(self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0.): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -first_RUN = False - - -class PatchEmbed(nn.Module): - """ 2D Image to Patch Embedding - """ - - def __init__(self, - img_size=224, - patch_size=16, - stride=16, - in_chans=3, - embed_dim=768, - norm_layer=None, - flatten=True): - super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - stride = to_2tuple(stride) - self.img_size = img_size - self.patch_size = patch_size - self.stride = stride - self.grid_size = (img_size[0] // stride[0], img_size[1] // stride[1]) - self.num_patches = self.grid_size[0] * self.grid_size[1] - self.flatten = flatten - self.embed_dim = embed_dim - self.proj = nn.Conv2d(in_chans, - embed_dim, - kernel_size=patch_size, - stride=stride) - self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() - - def forward(self, x): - B, C, H, W = x.shape - if not (H == self.img_size[0] and W == self.img_size[1]): - warnings.warn( - f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
- ) - # to do maybe replace weights - x = self.proj(x) - if self.flatten: - x = x.flatten(2).transpose(1, 2) # BCHW -> BNC - x = self.norm(x) - if first_RUN: print("self.norm(x)", x.size()) - return x - - -class Attention(nn.Module): - - def __init__(self, - dim, - num_heads=8, - qkv_bias=False, - attn_drop=0., - proj_drop=0.): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = head_dim**-0.5 - - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - def forward(self, x): - B, N, C = x.shape - qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, - C // self.num_heads).permute(2, 0, 3, 1, 4) - q, k, v = qkv[0], qkv[1], qkv[ - 2] # make torchscript happy (cannot use tensor as tuple) - - attn = (q @ k.transpose(-2, -1)) * self.scale - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B, N, C) - x = self.proj(x) - x = self.proj_drop(x) - return x - - -class Block(nn.Module): - - def __init__(self, - dim, - num_heads, - mlp_ratio=4., - qkv_bias=False, - drop=0., - attn_drop=0., - drop_path=0., - act_layer=nn.GELU, - norm_layer=nn.LayerNorm): - super().__init__() - self.norm1 = norm_layer(dim) - self.attn = Attention(dim, - num_heads=num_heads, - qkv_bias=qkv_bias, - attn_drop=attn_drop, - proj_drop=drop) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path = DropPath( - drop_path) if drop_path > 0. else nn.Identity() - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop) - - def forward(self, x): - x = x + self.drop_path(self.attn(self.norm1(x))) - x = x + self.drop_path(self.mlp(self.norm2(x))) - return x - - -class PaSST(nn.Module): - """ - - Based on the implementation of Vision Transformer in timm library. - Take a look at the get_model function, adapting the weights of pretrained imagenet models. 
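Editorial sketch: for reference, the tensor shapes flowing through `Attention.forward` above, re-traced as a standalone example (the sizes B=2, N=5, C=8, 2 heads are arbitrary):

import torch

B, N, C, H = 2, 5, 8, 2
qkv = torch.randn(B, N, 3 * C)                     # output of the qkv Linear
qkv = qkv.reshape(B, N, 3, H, C // H).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2]                   # each (B, heads, N, head_dim)
attn = (q @ k.transpose(-2, -1)) * (C // H) ** -0.5
attn = attn.softmax(dim=-1)                        # (B, heads, N, N)
out = (attn @ v).transpose(1, 2).reshape(B, N, C)  # back to (B, N, C)
print(q.shape, attn.shape, out.shape)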
- - """ - - def __init__(self, - u_patchout=0, - s_patchout_t=0, - s_patchout_f=0, - img_size=(128, 998), - patch_size=16, - stride=16, - in_chans=1, - num_classes=527, - embed_dim=768, - depth=12, - num_heads=12, - mlp_ratio=4., - qkv_bias=True, - representation_size=None, - distilled=False, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - embed_layer=PatchEmbed, - norm_layer=None, - act_layer=None, - weight_init=''): - """ - Args: - u_patchout: Unstructured Patchout integer, number of items to be removed from the final sequence - s_patchout_t: structured Patchout time integer, number of columns to be removed from the patches grid - s_patchout_f: structured Patchout Frequency integer, number of rows to be removed from the patches grid - img_size (int, tuple): input image size - patch_size (int, tuple): patch size - in_chans (int): number of input channels - num_classes (int): number of classes for classification head - embed_dim (int): embedding dimension - depth (int): depth of transformer - num_heads (int): number of attention heads - mlp_ratio (int): ratio of mlp hidden dim to embedding dim - qkv_bias (bool): enable bias for qkv if True - representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set - distilled (bool): model includes a distillation token and head as in DeiT models - drop_rate (float): dropout rate - attn_drop_rate (float): attention dropout rate - drop_path_rate (float): stochastic depth rate - embed_layer (nn.Module): patch embedding layer - norm_layer: (nn.Module): normalization layer - weight_init: (str): weight init scheme - """ - super().__init__() - self.num_classes = num_classes - self.u_patchout = u_patchout - self.s_patchout_t = s_patchout_t - self.s_patchout_f = s_patchout_f - self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models - self.num_tokens = 2 if distilled else 1 - norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) - act_layer = act_layer or nn.GELU - - self.patch_embed = embed_layer(img_size=img_size, - patch_size=patch_size, - stride=stride, - in_chans=in_chans, - embed_dim=embed_dim, - flatten=False) - num_patches = self.patch_embed.num_patches - - self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) - self.dist_token = nn.Parameter(torch.zeros( - 1, 1, embed_dim)) if distilled else None - # PaSST - # refer to https://arxiv.org/abs/2110.05069 Section 2 - self.new_pos_embed = nn.Parameter( - torch.zeros(1, self.num_tokens, embed_dim)) # for C and D tokens - self.freq_new_pos_embed = nn.Parameter( - torch.zeros(1, embed_dim, self.patch_embed.grid_size[0], 1)) # | f - self.time_new_pos_embed = nn.Parameter( - torch.zeros(1, embed_dim, 1, - self.patch_embed.grid_size[1])) # __ t - #### - self.pos_drop = nn.Dropout(p=drop_rate) - - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) - ] # stochastic depth decay rule - self.blocks = nn.Sequential(*[ - Block(dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[i], - norm_layer=norm_layer, - act_layer=act_layer) for i in range(depth) - ]) - self.norm = norm_layer(embed_dim) - - # Representation layer - if representation_size and not distilled: - self.num_features = representation_size - self.pre_logits = nn.Sequential( - OrderedDict([('fc', nn.Linear(embed_dim, representation_size)), - ('act', nn.Tanh())])) - else: - self.pre_logits = nn.Identity() - - # Classifier head(s) - self.head = 
nn.Sequential( - nn.LayerNorm(self.num_features), - nn.Linear(self.num_features, num_classes) - if num_classes > 0 else nn.Identity()) - self.head_dist = None - if distilled: - self.head_dist = nn.Linear( - self.embed_dim, - self.num_classes) if num_classes > 0 else nn.Identity() - - self.init_weights(weight_init) - - def init_weights(self, mode=''): - assert mode in ('jax', 'jax_nlhb', 'nlhb', '') - head_bias = -math.log(self.num_classes) if 'nlhb' in mode else 0. - trunc_normal_(self.new_pos_embed, std=.02) - trunc_normal_(self.freq_new_pos_embed, std=.02) - trunc_normal_(self.time_new_pos_embed, std=.02) - if self.dist_token is not None: - trunc_normal_(self.dist_token, std=.02) - if mode.startswith('jax'): - # leave cls token as zeros to match jax impl - raise RuntimeError("Not supported yet") - else: - trunc_normal_(self.cls_token, std=.02) - self.apply(_init_vit_weights) - - def _init_weights(self, m): - # this fn left here for compat with downstream users - _init_vit_weights(m) - - @torch.jit.ignore - def no_weight_decay(self): - return { - 'new_pos_embed', 'freq_new_pos_embed', 'time_new_pos_embed', - 'cls_token', 'dist_token' - } - - def get_classifier(self): - if self.dist_token is None: - return self.head - else: - return self.head, self.head_dist - - def reset_classifier(self, num_classes, global_pool=''): - self.num_classes = num_classes - self.head = nn.Linear( - self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() - if self.num_tokens == 2: - self.head_dist = nn.Linear( - self.embed_dim, - self.num_classes) if num_classes > 0 else nn.Identity() - - def forward_features(self, x): - global first_RUN # not jit friendly? use trace instead - x = self.patch_embed(x) # [b, e, f, t] - B_dim, E_dim, F_dim, T_dim = x.shape # slow - if first_RUN: print(" patch_embed : ", x.shape) - # Adding Time/Freq information - if first_RUN: - print(" self.time_new_pos_embed.shape", - self.time_new_pos_embed.shape) - time_new_pos_embed = self.time_new_pos_embed - if x.shape[-1] < time_new_pos_embed.shape[-1]: - if self.training: - toffset = torch.randint( - 1 + time_new_pos_embed.shape[-1] - x.shape[-1], - (1, )).item() - if first_RUN: - print( - f" CUT with randomoffset={toffset} time_new_pos_embed.shape", - time_new_pos_embed.shape) - time_new_pos_embed = time_new_pos_embed[:, :, :, - toffset:toffset + - x.shape[-1]] - else: - time_new_pos_embed = time_new_pos_embed[:, :, :, :x.shape[-1]] - if first_RUN: - print(" CUT time_new_pos_embed.shape", - time_new_pos_embed.shape) - else: - warnings.warn( - f"the patches shape:{x.shape} are larger than the expected time encodings {time_new_pos_embed.shape}, x will be cut" - ) - x = x[:, :, :, :time_new_pos_embed.shape[-1]] - x = x + time_new_pos_embed - if first_RUN: - print(" self.freq_new_pos_embed.shape", - self.freq_new_pos_embed.shape) - x = x + self.freq_new_pos_embed - - # Structured Patchout https://arxiv.org/abs/2110.05069 Section 2.2 - if self.s_patchout_t: - if first_RUN: - print(f"X Before time Patchout of {self.s_patchout_t} ", - x.size()) - # ([1, 768, 1, 82]) - random_indices = torch.randperm( - T_dim)[:T_dim - self.s_patchout_t].sort().values - x = x[:, :, :, random_indices] - if first_RUN: print("X after time Patchout", x.size()) - if self.s_patchout_f: - if first_RUN: - print(f"X Before Freq Patchout of {self.s_patchout_f} ", - x.size()) - # [1, 768, 12, 1] - random_indices = torch.randperm( - F_dim)[:F_dim - self.s_patchout_f].sort().values - x = x[:, :, random_indices, :] - if first_RUN: print(" \n X after freq Patchout: 
", x.size()) - ### - # Flatten the sequence - x = x.flatten(2).transpose(1, 2) - # Unstructured Patchout - if first_RUN: print("X flattened", x.size()) - if self.training and self.u_patchout: - seq_len = x.shape[1] - random_indices = torch.randperm( - seq_len)[:seq_len - self.u_patchout].sort().values - x = x[:, random_indices, :] - if first_RUN: print("X After Unstructured Patchout", x.size()) - #### - # Add the C/D tokens - if first_RUN: - print(" self.new_pos_embed.shape", self.new_pos_embed.shape) - cls_tokens = self.cls_token.expand(B_dim, -1, - -1) + self.new_pos_embed[:, :1, :] - if first_RUN: print(" self.cls_tokens.shape", cls_tokens.shape) - if self.dist_token is None: - x = torch.cat((cls_tokens, x), dim=1) - else: - dist_token = self.dist_token.expand( - B_dim, -1, -1) + self.new_pos_embed[:, 1:, :] - if first_RUN: print(" self.dist_token.shape", dist_token.shape) - x = torch.cat((cls_tokens, dist_token, x), dim=1) - - if first_RUN: print(" final sequence x", x.shape) - x = self.pos_drop(x) - x = self.blocks(x) - if first_RUN: - print(f" after {len(self.blocks)} atten blocks x", x.shape) - x = self.norm(x) - - return x[:, 1:, :] - - def forward(self, x): - global first_RUN - if first_RUN: print("x", x.size()) - - features = self.forward_features(x) - # first_RUN = False - - # if self.head_dist is not None: - # features = (x[0] + x[1]) / 2 - # if first_RUN: print("forward_features", features.size()) - # x = self.head(features) - # if first_RUN: print("head", x.size()) - # first_RUN = False - # return x, features - # else: - # features = x - # if first_RUN: print("forward_features", features.size()) - # x = self.head(x) - # if first_RUN: print("head", x.size()) - # first_RUN = False - return features - - -def _init_vit_weights(module: nn.Module, - name: str = '', - head_bias: float = 0., - jax_impl: bool = False): - """ ViT weight initialization - * When called without n, head_bias, jax_impl args it will behave exactly the same - as my original init for compatibility with prev hparam / downstream use cases (ie DeiT). - * When called w/ valid n (module name) and jax_impl=True, will (hopefully) match JAX impl - """ - if isinstance(module, nn.Linear): - if name.startswith('head'): - nn.init.zeros_(module.weight) - nn.init.constant_(module.bias, head_bias) - elif name.startswith('pre_logits'): - lecun_normal_(module.weight) - nn.init.zeros_(module.bias) - else: - if jax_impl: - nn.init.xavier_uniform_(module.weight) - if module.bias is not None: - if 'mlp' in name: - nn.init.normal_(module.bias, std=1e-6) - else: - nn.init.zeros_(module.bias) - else: - trunc_normal_(module.weight, std=.02) - if module.bias is not None: - nn.init.zeros_(module.bias) - elif jax_impl and isinstance(module, nn.Conv2d): - # NOTE conv was left to pytorch default in my original init - lecun_normal_(module.weight) - if module.bias is not None: - nn.init.zeros_(module.bias) - elif isinstance(module, (nn.LayerNorm, nn.GroupNorm, nn.BatchNorm2d)): - nn.init.zeros_(module.bias) - nn.init.ones_(module.weight) - - -def resize_pos_embed(posemb, - posemb_new, - num_tokens=1, - gs_new=(), - mode='bicubic'): - # Rescale the grid of position embeddings when loading from state_dict. 
Adapted from - # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 - _logger.info('Resized position embedding: %s to %s with %s cls/dis tokens', - posemb.shape, posemb_new.shape, num_tokens) - ntok_new = posemb_new.shape[1] - if num_tokens: - posemb_tok, posemb_grid = posemb[:, :num_tokens], posemb[0, - num_tokens:] - ntok_new -= num_tokens - else: - posemb_tok, posemb_grid = posemb[:, :0], posemb[0] - gs_old = int(math.sqrt(len(posemb_grid))) - if not len(gs_new): # backwards compatibility - gs_new = [int(math.sqrt(ntok_new))] * 2 - assert len(gs_new) >= 2 - _logger.info('Position embedding grid-size from %s to %s', - [gs_old, gs_old], gs_new) - posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, - -1).permute(0, 3, 1, 2) - posemb_grid = F.interpolate(posemb_grid, - size=gs_new, - mode=mode, - align_corners=False) - posemb_grid = posemb_grid.permute(0, 2, 3, - 1).reshape(1, gs_new[0] * gs_new[1], -1) - posemb = torch.cat([posemb_tok, posemb_grid], dim=1) - return posemb - - -def adapt_image_pos_embed_to_passt(posemb, - num_tokens=1, - gs_new=(), - mode='bicubic'): - # Rescale the grid of position embeddings when loading from state_dict. Adapted from - # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 - _logger.info('Resized position embedding: %s to %s with %s cls/dis tokens', - posemb.shape, gs_new, num_tokens) - if num_tokens: - posemb_tok, posemb_grid = posemb[:, :num_tokens], posemb[0, - num_tokens:] - else: - posemb_tok, posemb_grid = posemb[:, :0], posemb[0] - gs_old = int(math.sqrt(len(posemb_grid))) - - assert len(gs_new) >= 2 - _logger.info('Position embedding grid-size from %s to %s', - [gs_old, gs_old], gs_new) - posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, - -1).permute(0, 3, 1, 2) - posemb_grid = F.interpolate(posemb_grid, - size=gs_new, - mode=mode, - align_corners=False) - freq_new_pos_embed = posemb_grid.mean(dim=3, keepdim=True) - time_new_pos_embed = posemb_grid.mean(dim=2, keepdim=True) - _logger.info('New Position cls/dstl embedding %s', posemb_tok.shape) - _logger.info('New FREQ Position embedding %s', freq_new_pos_embed.shape) - _logger.info('New TIME Position embedding %s', time_new_pos_embed.shape) - return posemb_tok, freq_new_pos_embed, time_new_pos_embed - - -def checkpoint_filter_fn(state_dict, model): - """ convert patch embedding weight from manual patchify + linear proj to conv""" - out_dict = {} - if 'model' in state_dict: - # For deit models - state_dict = state_dict['model'] - state_dict = {k: v for k, v in state_dict.items()} - if "time_new_pos_embed" not in state_dict: - # we are working with ImageNet model - _logger.info( - "Adapting pos embedding from ImageNet pretrained model to PaSST.") - v = state_dict.pop("pos_embed") - new_pos_embed, freq_new_pos_embed, time_new_pos_embed = adapt_image_pos_embed_to_passt( - v, getattr(model, 'num_tokens', 1), model.patch_embed.grid_size) - state_dict["new_pos_embed"] = new_pos_embed - state_dict["freq_new_pos_embed"] = freq_new_pos_embed - state_dict["time_new_pos_embed"] = time_new_pos_embed - - for k, v in state_dict.items(): - if 'patch_embed.proj.weight' in k and len(v.shape) < 4: - # For old models that I trained prior to conv based patchification - O, I, H, W = model.patch_embed.proj.weight.shape - v = v.reshape(O, -1, H, W) - elif k == 'pos_embed' and v.shape != model.pos_embed.shape: - # this should never occur - v = resize_pos_embed(v, 
model.pos_embed, - getattr(model, 'num_tokens', 1), - model.patch_embed.grid_size) - out_dict[k] = v - return out_dict - - -def _create_vision_transformer(variant, - pretrained=False, - default_cfg=None, - **kwargs): - default_cfg = default_cfg or default_cfgs[variant] - if kwargs.get('features_only', None): - raise RuntimeError( - 'features_only not implemented for Vision Transformer models.') - - # NOTE this extra code to support handling of repr size for in21k pretrained models - default_num_classes = default_cfg['num_classes'] - num_classes = kwargs.get('num_classes', default_num_classes) - repr_size = kwargs.pop('representation_size', None) - if repr_size is not None and num_classes != default_num_classes: - # Remove representation layer if fine-tuning. This may not always be the desired action, - # but I feel better than doing nothing by default for fine-tuning. Perhaps a better interface? - _logger.warning("Removing representation layer for fine-tuning.") - repr_size = None - - del default_cfg['url'] - default_cfg[ - 'file'] = 'src/modeling/passt/passt-s-f128-p16-s10-ap.476-swa.pt' - model = build_model_with_cfg(PaSST, - variant, - pretrained, - default_cfg=default_cfg, - representation_size=repr_size, - pretrained_filter_fn=checkpoint_filter_fn, - pretrained_custom_load=False, - **kwargs) - return model - - -def vit_huge_patch14_224_in21k(pretrained=False, **kwargs): - """ ViT-Huge model (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929). - ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. - NOTE: this model has a representation layer but the 21k classifier head is zero'd out in original weights - """ - model_kwargs = dict(patch_size=14, - embed_dim=1280, - depth=32, - num_heads=16, - representation_size=1280, - **kwargs) - model = _create_vision_transformer('vit_huge_patch14_224_in21k', - pretrained=pretrained, - **model_kwargs) - return model - - -def deit_base_distilled_patch16_384(pretrained=False, **kwargs): - """ DeiT-base distilled model @ 384x384 from paper (https://arxiv.org/abs/2012.12877). - ImageNet-1k weights from https://github.com/facebookresearch/deit. - """ - print("\n\n Loading DEIT BASE 384\n\n") - model_kwargs = dict(patch_size=16, - embed_dim=768, - depth=12, - num_heads=12, - **kwargs) - model = _create_vision_transformer('deit_base_distilled_patch16_384', - pretrained=pretrained, - distilled=True, - **model_kwargs) - return model - - -def passt_s_swa_p16_128_ap476(pretrained=False, **kwargs): - """ PaSST pre-trained on AudioSet - """ - print( - "\n\n Loading PaSST pre-trained on AudioSet Patch 16 stride 10 structured patchout mAP=476 SWA \n\n" - ) - model_kwargs = dict(patch_size=16, - embed_dim=768, - depth=12, - num_heads=12, - **kwargs) - if model_kwargs.get("stride") != (10, 10): - warnings.warn( - f"This model was pre-trained with strides {(10, 10)}, but now you set (fstride,tstride) to {model_kwargs.get('stride')}." 
-        )
-    model = _create_vision_transformer('passt_s_swa_p16_128_ap476',
-                                       pretrained=pretrained,
-                                       distilled=True,
-                                       **model_kwargs)
-    return model
-
-
-def passt_s_swa_p16_128_ap4761(pretrained=False, **kwargs):
-    """ PaSST pre-trained on AudioSet
-    """
-    print(
-        "\n\n Loading PaSST pre-trained on AudioSet Patch 16 stride 10 structured patchout mAP=4761 SWA \n\n"
-    )
-    model_kwargs = dict(patch_size=16,
-                        embed_dim=768,
-                        depth=12,
-                        num_heads=12,
-                        **kwargs)
-    if model_kwargs.get("stride") != (10, 10):
-        warnings.warn(
-            f"This model was pre-trained with strides {(10, 10)}, but now you set (fstride,tstride) to {model_kwargs.get('stride')}."
-        )
-    model = _create_vision_transformer('passt_s_swa_p16_128_ap4761',
-                                       pretrained=pretrained,
-                                       distilled=True,
-                                       **model_kwargs)
-    return model
-
-
-def passt_s_p16_128_ap472(pretrained=False, **kwargs):
-    """ PaSST pre-trained on AudioSet
-    """
-    print(
-        "\n\n Loading PaSST pre-trained on AudioSet Patch 16 stride 10 structured patchout mAP=472 \n\n"
-    )
-    model_kwargs = dict(patch_size=16,
-                        embed_dim=768,
-                        depth=12,
-                        num_heads=12,
-                        **kwargs)
-    if model_kwargs.get("stride") != (10, 10):
-        warnings.warn(
-            f"This model was pre-trained with strides {(10, 10)}, but now you set (fstride,tstride) to {model_kwargs.get('stride')}."
-        )
-    model = _create_vision_transformer('passt_s_p16_128_ap472',
-                                       pretrained=pretrained,
-                                       distilled=True,
-                                       **model_kwargs)
-    return model
-
-
-def passt_s_p16_s12_128_ap470(pretrained=False, **kwargs):
-    """ PaSST pre-trained on AudioSet
-    """
-    print(
-        "\n\n Loading PaSST pre-trained on AudioSet Patch 16 stride 12 structured patchout mAP=470 \n\n"
-    )
-    model_kwargs = dict(patch_size=16,
-                        embed_dim=768,
-                        depth=12,
-                        num_heads=12,
-                        **kwargs)
-    if model_kwargs.get("stride") != (12, 12):
-        warnings.warn(
-            f"This model was pre-trained with strides {(12, 12)}, but now you set (fstride,tstride) to {model_kwargs.get('stride')}."
-        )
-    model = _create_vision_transformer('passt_s_p16_s12_128_ap470',
-                                       pretrained=pretrained,
-                                       distilled=True,
-                                       **model_kwargs)
-    return model
-
-
-def passt_s_f128_20sec_p16_s10_ap474_swa(pretrained=False, **kwargs):
-    print(
-        "\n\n Loading PaSST trained on AudioSet with 20 second time encodings, with STFT hop of 160 \n\n"
-    )
-    model_kwargs = dict(patch_size=16,
-                        embed_dim=768,
-                        depth=12,
-                        num_heads=12,
-                        **kwargs)
-    model = _create_vision_transformer('passt-s-f128-20sec-p16-s10-ap474-swa',
-                                       pretrained=pretrained,
-                                       distilled=True,
-                                       **model_kwargs)
-    return model
-
-
-def passt_s_f128_30sec_p16_s10_ap473_swa(pretrained=False, **kwargs):
-    print(
-        "\n\n Loading PaSST trained on AudioSet with 30 second time encodings, with STFT hop of 160 \n\n"
-    )
-    model_kwargs = dict(patch_size=16,
-                        embed_dim=768,
-                        depth=12,
-                        num_heads=12,
-                        **kwargs)
-    model = _create_vision_transformer('passt-s-f128-30sec-p16-s10-ap473-swa',
-                                       pretrained=pretrained,
-                                       distilled=True,
-                                       **model_kwargs)
-    return model
-
-
-def passt_s_swa_p16_s12_128_ap473(pretrained=False, **kwargs):
-    """ PaSST pre-trained on AudioSet
-    """
-    print(
-        "\n\n Loading PaSST pre-trained on AudioSet Patch 16 stride 12 structured patchout mAP=473 \n\n"
-    )
-    model_kwargs = dict(patch_size=16,
-                        embed_dim=768,
-                        depth=12,
-                        num_heads=12,
-                        **kwargs)
-    if model_kwargs.get("stride") != (12, 12):
-        warnings.warn(
-            f"This model was pre-trained with strides {(12, 12)}, but now you set (fstride,tstride) to {model_kwargs.get('stride')}."
- ) - model = _create_vision_transformer('passt_s_swa_p16_s12_128_ap473', - pretrained=pretrained, - distilled=True, - **model_kwargs) - return model - - -def passt_s_p16_s14_128_ap469(pretrained=False, **kwargs): - """ PaSST pre-trained on AudioSet - """ - print( - "\n\n Loading PaSST pre-trained on AudioSet Patch 16 stride 14 structured patchout mAP=469 \n\n" - ) - model_kwargs = dict(patch_size=16, - embed_dim=768, - depth=12, - num_heads=12, - **kwargs) - if model_kwargs.get("stride") != (14, 14): - warnings.warn( - f"This model was pre-trained with strides {(14, 14)}, but now you set (fstride,tstride) to {model_kwargs.get('stride')}." - ) - model = _create_vision_transformer('passt_s_p16_s14_128_ap469', - pretrained=pretrained, - distilled=True, - **model_kwargs) - return model - - -def passt_s_swa_p16_s14_128_ap471(pretrained=False, **kwargs): - """ PaSST pre-trained on AudioSet - """ - print( - "\n\n Loading PaSST pre-trained on AudioSet Patch 16 stride 14 structured patchout mAP=471 \n\n" - ) - model_kwargs = dict(patch_size=16, - embed_dim=768, - depth=12, - num_heads=12, - **kwargs) - if model_kwargs.get("stride") != (14, 14): - warnings.warn( - f"This model was pre-trained with strides {(14, 14)}, but now you set (fstride,tstride) to {model_kwargs.get('stride')}." - ) - model = _create_vision_transformer('passt_s_swa_p16_s14_128_ap471', - pretrained=pretrained, - distilled=True, - **model_kwargs) - return model - - -def passt_s_swa_p16_s16_128_ap473(pretrained=False, **kwargs): - """ PaSST pre-trained on AudioSet - """ - print( - "\n\n Loading PaSST pre-trained on AudioSet Patch 16 stride 16 structured patchout mAP=473 \n\n" - ) - model_kwargs = dict(patch_size=16, - embed_dim=768, - depth=12, - num_heads=12, - **kwargs) - if model_kwargs.get("stride") != (16, 16): - warnings.warn( - f"This model was pre-trained with strides {(16, 16)}, but now you set (fstride,tstride) to {model_kwargs.get('stride')}." - ) - model = _create_vision_transformer('passt_s_swa_p16_s16_128_ap473', - pretrained=pretrained, - distilled=True, - **model_kwargs) - return model - - -def passt_s_p16_s16_128_ap468(pretrained=False, **kwargs): - """ PaSST pre-trained on AudioSet - """ - print( - "\n\n Loading PaSST pre-trained on AudioSet Patch 16 stride 16 structured patchout mAP=468 \n\n" - ) - model_kwargs = dict(patch_size=16, - embed_dim=768, - depth=12, - num_heads=12, - **kwargs) - if model_kwargs.get("stride") != (16, 16): - warnings.warn( - f"This model was pre-trained with strides {(16, 16)}, but now you set (fstride,tstride) to {model_kwargs.get('stride')}."
- ) - model = _create_vision_transformer('passt_s_p16_s16_128_ap468', - pretrained=pretrained, - distilled=True, - **model_kwargs) - return model - - -def fix_embedding_layer(model, embed="default"): - if embed == "default": - return model - if embed == "overlap": - model.patch_embed = PatchEmbedAdaptiveMean(replace=model.patch_embed) - if embed == "am_keepconv": - model.patch_embed = PatchEmbedAdaptiveMeanKeepConv( - replace=model.patch_embed) - return model - - -def lighten_model(model, cut_depth=0): - if cut_depth == 0: - return model - if cut_depth: - if cut_depth < 0: - print( - f"\n Reducing model depth by removing every {-cut_depth} layer \n\n" - ) - else: - print(f"\n Reducing model depth by {cut_depth} \n\n") - if len(model.blocks) < cut_depth + 2: - raise ValueError( - f"cut_depth for a ViT with {len(model.blocks)} " - f"layers should be between 1 and {len(model.blocks) - 2}") - print(f"\n Before Cutting it was {len(model.blocks)} \n\n") - - old_blocks = list(model.blocks.children()) - if cut_depth < 0: - print(f"cut_depth={cut_depth}") - old_blocks = [old_blocks[0] - ] + old_blocks[1:-1:-cut_depth] + [old_blocks[-1]] - else: - old_blocks = [old_blocks[0]] + old_blocks[cut_depth + 1:] - model.blocks = nn.Sequential(*old_blocks) - print(f"\n After Cutting it is {len(model.blocks)} \n\n") - return model - - -def get_model( - arch="passt_s_swa_p16_128_ap476", - pretrained=True, - n_classes=527, - in_channels=1, - fstride=10, - tstride=10, - input_fdim=128, - input_tdim=998, - u_patchout=0, - s_patchout_t=0, - s_patchout_f=0, -): - """ - :param arch: Base ViT or Deit architecture - :param pretrained: use pretrained model on imagenet - :param n_classes: number of classes - :param in_channels: number of input channels: 1 for mono - :param fstride: the patches stride over frequency. - :param tstride: the patches stride over time. - :param input_fdim: the expected input frequency bins. - :param input_tdim: the expected input time bins. - :param u_patchout: number of input patches to drop in Unstructured Patchout as defined in https://arxiv.org/abs/2110.05069 - :param s_patchout_t: number of input time frames to drop in Structured Patchout as defined in https://arxiv.org/abs/2110.05069 - :param s_patchout_f: number of input frequency bins to drop in Structured Patchout as defined in https://arxiv.org/abs/2110.05069
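Example (illustrative sketch, assuming torch is in scope in this module;
it shows the two-tuple output that EnsembelerModel below relies on):
    >>> model = get_model(arch="passt_s_swa_p16_128_ap476",
    ...                   pretrained=False, n_classes=527,
    ...                   fstride=10, tstride=10,
    ...                   input_fdim=128, input_tdim=998)
    >>> spec = torch.randn(2, 1, 128, 998)  # (batch, channel, mel bins, frames)
    >>> logits, embedding = model(spec)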
- :return: - - """ - model_func = None - input_size = (input_fdim, input_tdim) - stride = (fstride, tstride) - if arch == "passt_deit_bd_p16_384": # base deit - model_func = deit_base_distilled_patch16_384 - elif arch == "passt_s_swa_p16_128_ap476": # pretrained - model_func = passt_s_swa_p16_128_ap476 - elif arch == "passt_s_swa_p16_128_ap4761": - model_func = passt_s_swa_p16_128_ap4761 - elif arch == "passt_s_p16_128_ap472": - model_func = passt_s_p16_128_ap472 - elif arch == "passt_s_p16_s16_128_ap468": - model_func = passt_s_p16_s16_128_ap468 - elif arch == "passt_s_swa_p16_s16_128_ap473": - model_func = passt_s_swa_p16_s16_128_ap473 - elif arch == "passt_s_swa_p16_s14_128_ap471": - model_func = passt_s_swa_p16_s14_128_ap471 - elif arch == "passt_s_p16_s14_128_ap469": - model_func = passt_s_p16_s14_128_ap469 - elif arch == "passt_s_swa_p16_s12_128_ap473": - model_func = passt_s_swa_p16_s12_128_ap473 - elif arch == "passt_s_p16_s12_128_ap470": - model_func = passt_s_p16_s12_128_ap470 - elif arch == "passt_s_f128_20sec_p16_s10_ap474": - model_func = passt_s_f128_20sec_p16_s10_ap474_swa - elif arch == "passt_s_f128_30sec_p16_s10_ap473": - model_func = passt_s_f128_30sec_p16_s10_ap473_swa - - if model_func is None: - raise RuntimeError(f"Unknown model {arch}") - model = model_func(pretrained=pretrained, - num_classes=n_classes, - in_chans=in_channels, - img_size=input_size, - stride=stride, - u_patchout=u_patchout, - s_patchout_t=s_patchout_t, - s_patchout_f=s_patchout_f) - model = fix_embedding_layer(model) - model = lighten_model(model) - # print(model) - return model - - -class EnsembelerModel(nn.Module): - - def __init__(self, models): - super(EnsembelerModel, self).__init__() - self.models = nn.ModuleList(models) - - def forward(self, x): - # ModuleList can act as an iterable, or be indexed using ints - all_out = None - for i, m in enumerate(self.models): - out, _ = m(x) - if all_out is None: - all_out = out - else: - all_out = out + all_out - all_out = all_out / len(self.models) - return all_out, all_out - - -def get_ensemble_model(arch_list=[]): - # arch_list = [(passt_s_swa_p16_128_ap476,fstride,tstride)] - models_list = [ - get_model(arch=arch, fstride=fstride, tstride=tstride) - for arch, fstride, tstride in arch_list - ] - model = EnsembelerModel(models_list) - # print(model) - return model diff --git a/AVLFormer/src/modeling/passt/preprocess.py b/AVLFormer/src/modeling/passt/preprocess.py deleted file mode 100644 index 433064e..0000000 --- a/AVLFormer/src/modeling/passt/preprocess.py +++ /dev/null @@ -1,111 +0,0 @@ -import torch -import torch.nn as nn -import torchaudio - -sz_float = 4 # size of a float -epsilon = 10e-8 # fudge factor for normalization - - -class AugmentMelSTFT(nn.Module): - - def __init__(self, - n_mels=128, - sr=32000, - win_length=800, - hopsize=320, - n_fft=1024, - freqm=48, - timem=192, - htk=False, - fmin=0.0, - fmax=None, - norm=1, - fmin_aug_range=1, - fmax_aug_range=1000): - torch.nn.Module.__init__(self) - # adapted from: https://github.com/CPJKU/kagglebirds2020/commit/70f8308b39011b09d41eb0f4ace5aa7d2b0e806e - # Similar config to the spectrograms used in AST: https://github.com/YuanGongND/ast - - self.win_length = win_length - self.n_mels = n_mels - self.n_fft = n_fft - self.sr = sr - self.htk = htk - self.fmin = fmin - if fmax is None: - fmax = sr // 2 - fmax_aug_range // 2 - print(f"Warning: FMAX is None setting to {fmax} ") - self.fmax = fmax - self.norm = norm - self.hopsize = hopsize - self.register_buffer('window', - 
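# NOTE: a symmetric Hann window (periodic=False); persistent=False keeps
# the buffer out of the state_dict, since it can be rebuilt from win_length.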
torch.hann_window(win_length, periodic=False), - persistent=False) - assert fmin_aug_range >= 1, f"fmin_aug_range={fmin_aug_range} should be >=1; 1 means no augmentation" - assert fmax_aug_range >= 1, f"fmax_aug_range={fmax_aug_range} should be >=1; 1 means no augmentation" - self.fmin_aug_range = fmin_aug_range - self.fmax_aug_range = fmax_aug_range - - self.register_buffer("preemphasis_coefficient", - torch.as_tensor([[[-.97, 1]]]), - persistent=False) - if freqm == 0: - self.freqm = torch.nn.Identity() - else: - self.freqm = torchaudio.transforms.FrequencyMasking(freqm, - iid_masks=True) - if timem == 0: - self.timem = torch.nn.Identity() - else: - self.timem = torchaudio.transforms.TimeMasking(timem, - iid_masks=True) - - def forward(self, x): - - x = nn.functional.conv1d(x.unsqueeze(1), - self.preemphasis_coefficient).squeeze(1) - x = torch.stft(x, - self.n_fft, - hop_length=self.hopsize, - win_length=self.win_length, - center=True, - normalized=False, - window=self.window, - return_complex=False) - x = (x**2).sum(dim=-1) # power mag - fmin = self.fmin + torch.randint(self.fmin_aug_range, (1, )).item() - fmax = self.fmax + self.fmax_aug_range // 2 - torch.randint( - self.fmax_aug_range, (1, )).item() - # don't augment eval data - if not self.training: - fmin = self.fmin - fmax = self.fmax - - mel_basis, _ = torchaudio.compliance.kaldi.get_mel_banks( - self.n_mels, - self.n_fft, - self.sr, - fmin, - fmax, - vtln_low=100.0, - vtln_high=-500., - vtln_warp_factor=1.0) - mel_basis = torch.as_tensor(torch.nn.functional.pad(mel_basis, (0, 1), - mode='constant', - value=0), - device=x.device) - with torch.cuda.amp.autocast(enabled=False): - melspec = torch.matmul(mel_basis, x) - - melspec = (melspec + 0.00001).log() - - if self.training: - melspec = self.freqm(melspec) - melspec = self.timem(melspec) - - melspec = (melspec + 4.5) / 5.
# fast normalization - - return melspec - - def extra_repr(self): - return 'winsize={}, hopsize={}'.format(self.win_length, self.hopsize) diff --git a/AVLFormer/src/modeling/passt/version b/AVLFormer/src/modeling/passt/version deleted file mode 100644 index a566604..0000000 --- a/AVLFormer/src/modeling/passt/version +++ /dev/null @@ -1 +0,0 @@ -Oct, 2022 \ No newline at end of file diff --git a/AVLFormer/src/modeling/swin/__init__.py b/AVLFormer/src/modeling/swin/__init__.py deleted file mode 100644 index 59774f7..0000000 --- a/AVLFormer/src/modeling/swin/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .build import build_model diff --git a/AVLFormer/src/modeling/swin/build.py b/AVLFormer/src/modeling/swin/build.py deleted file mode 100644 index 6f84dc7..0000000 --- a/AVLFormer/src/modeling/swin/build.py +++ /dev/null @@ -1,32 +0,0 @@ -# -------------------------------------------------------- -# Swin Transformer -# Copyright (c) 2021 Microsoft -# Licensed under The MIT License [see LICENSE for details] -# Written by Ze Liu -# -------------------------------------------------------- - -from .swin_transformer import SwinTransformer - -def build_model(config): - model_type = config.MODEL.TYPE - if model_type == 'swin': - model = SwinTransformer(img_size=config.DATA.IMG_SIZE, - patch_size=config.MODEL.SWIN.PATCH_SIZE, - in_chans=config.MODEL.SWIN.IN_CHANS, - num_classes=config.MODEL.NUM_CLASSES, - embed_dim=config.MODEL.SWIN.EMBED_DIM, - depths=config.MODEL.SWIN.DEPTHS, - num_heads=config.MODEL.SWIN.NUM_HEADS, - window_size=config.MODEL.SWIN.WINDOW_SIZE, - mlp_ratio=config.MODEL.SWIN.MLP_RATIO, - qkv_bias=config.MODEL.SWIN.QKV_BIAS, - qk_scale=config.MODEL.SWIN.QK_SCALE, - drop_rate=config.MODEL.DROP_RATE, - drop_path_rate=config.MODEL.DROP_PATH_RATE, - ape=config.MODEL.SWIN.APE, - patch_norm=config.MODEL.SWIN.PATCH_NORM, - use_checkpoint=config.TRAIN.USE_CHECKPOINT) - else: - raise NotImplementedError(f"Unknown model: {model_type}") - - return model \ No newline at end of file diff --git a/AVLFormer/src/modeling/swin/config.py b/AVLFormer/src/modeling/swin/config.py deleted file mode 100644 index 0449f6f..0000000 --- a/AVLFormer/src/modeling/swin/config.py +++ /dev/null @@ -1,236 +0,0 @@ -# -------------------------------------------------------- -# Swin Transformer -# Copyright (c) 2021 Microsoft -# Licensed under The MIT License [see LICENSE for details] -# Written by Ze Liu -# --------------------------------------------------------' - -import os -import yaml -from yacs.config import CfgNode as CN - -_C = CN() - -# Base config files -_C.BASE = [''] - -# ----------------------------------------------------------------------------- -# Data settings -# ----------------------------------------------------------------------------- -_C.DATA = CN() - # Batch size for a single GPU, could be overwritten by command line argument -_C.DATA.BATCH_SIZE = 128 -# Path to dataset, could be overwritten by command line argument -_C.DATA.DATA_PATH = '' -# Dataset name -_C.DATA.DATASET = 'imagenet' -# Input image size -_C.DATA.IMG_SIZE = 224 -# Interpolation to resize image (random, bilinear, bicubic) -_C.DATA.INTERPOLATION = 'bicubic' -# Use zipped dataset instead of folder dataset -# could be overwritten by command line argument -_C.DATA.ZIP_MODE = False -# Cache Data in Memory, could be overwritten by command line argument -_C.DATA.CACHE_MODE = 'part' -# Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.
-_C.DATA.PIN_MEMORY = True -# Number of data loading threads -_C.DATA.NUM_WORKERS = 8 - -# ----------------------------------------------------------------------------- -# Model settings -# ----------------------------------------------------------------------------- -_C.MODEL = CN() -# Model type -_C.MODEL.TYPE = 'swin' -# Model name -_C.MODEL.NAME = 'swin_tiny_patch4_window7_224' -# Checkpoint to resume, could be overwritten by command line argument -_C.MODEL.RESUME = '' -# Number of classes, overwritten in data preparation -_C.MODEL.NUM_CLASSES = 1000 -# Dropout rate -_C.MODEL.DROP_RATE = 0.0 -# Drop path rate -_C.MODEL.DROP_PATH_RATE = 0.1 -# Label Smoothing -_C.MODEL.LABEL_SMOOTHING = 0.1 - -# Swin Transformer parameters -_C.MODEL.SWIN = CN() -_C.MODEL.SWIN.PATCH_SIZE = 4 -_C.MODEL.SWIN.IN_CHANS = 3 -_C.MODEL.SWIN.EMBED_DIM = 96 -_C.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] -_C.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] -_C.MODEL.SWIN.WINDOW_SIZE = 7 -_C.MODEL.SWIN.MLP_RATIO = 4. -_C.MODEL.SWIN.QKV_BIAS = True -_C.MODEL.SWIN.QK_SCALE = None -_C.MODEL.SWIN.APE = False -_C.MODEL.SWIN.PATCH_NORM = True - -# ----------------------------------------------------------------------------- -# Training settings -# ----------------------------------------------------------------------------- -_C.TRAIN = CN() -_C.TRAIN.START_EPOCH = 0 -_C.TRAIN.EPOCHS = 300 -_C.TRAIN.WARMUP_EPOCHS = 20 -_C.TRAIN.WEIGHT_DECAY = 0.05 -_C.TRAIN.BASE_LR = 5e-4 -_C.TRAIN.WARMUP_LR = 5e-7 -_C.TRAIN.MIN_LR = 5e-6 -# Clip gradient norm -_C.TRAIN.CLIP_GRAD = 5.0 -# Auto resume from latest checkpoint -_C.TRAIN.AUTO_RESUME = True -# Gradient accumulation steps -# could be overwritten by command line argument -_C.TRAIN.ACCUMULATION_STEPS = 0 -# Whether to use gradient checkpointing to save memory -# could be overwritten by command line argument -_C.TRAIN.USE_CHECKPOINT = False - -# LR scheduler -_C.TRAIN.LR_SCHEDULER = CN() -_C.TRAIN.LR_SCHEDULER.NAME = 'cosine' -# Epoch interval to decay LR, used in StepLRScheduler -_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 -# LR decay rate, used in StepLRScheduler -_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 - -# Optimizer -_C.TRAIN.OPTIMIZER = CN() -_C.TRAIN.OPTIMIZER.NAME = 'adamw' -# Optimizer Epsilon -_C.TRAIN.OPTIMIZER.EPS = 1e-8 -# Optimizer Betas -_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) -# SGD momentum -_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 - -# ----------------------------------------------------------------------------- -# Augmentation settings -# ----------------------------------------------------------------------------- -_C.AUG = CN() -# Color jitter factor -_C.AUG.COLOR_JITTER = 0.4 -# Use AutoAugment policy. "v0" or "original" -_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1' -# Random erase prob -_C.AUG.REPROB = 0.25 -# Random erase mode -_C.AUG.REMODE = 'pixel' -# Random erase count -_C.AUG.RECOUNT = 1 -# Mixup alpha, mixup enabled if > 0 -_C.AUG.MIXUP = 0.8 -# Cutmix alpha, cutmix enabled if > 0 -_C.AUG.CUTMIX = 1.0 -# Cutmix min/max ratio, overrides alpha and enables cutmix if set -_C.AUG.CUTMIX_MINMAX = None -# Probability of performing mixup or cutmix when either/both is enabled -_C.AUG.MIXUP_PROB = 1.0 -# Probability of switching to cutmix when both mixup and cutmix enabled -_C.AUG.MIXUP_SWITCH_PROB = 0.5 -# How to apply mixup/cutmix params. 
Per "batch", "pair", or "elem" -_C.AUG.MIXUP_MODE = 'batch' - -# ----------------------------------------------------------------------------- -# Testing settings -# ----------------------------------------------------------------------------- -_C.TEST = CN() -# Whether to use center crop when testing -_C.TEST.CROP = True - -# ----------------------------------------------------------------------------- -# Misc -# ----------------------------------------------------------------------------- -# Mixed precision opt level, if O0, no amp is used ('O0', 'O1', 'O2') -# overwritten by command line argument -_C.AMP_OPT_LEVEL = '' -# Path to output folder, overwritten by command line argument -_C.OUTPUT = '' -# Tag of experiment, overwritten by command line argument -_C.TAG = 'default' -# Frequency to save checkpoint -_C.SAVE_FREQ = 1 -# Frequency to logging info -_C.PRINT_FREQ = 10 -# Fixed random seed -_C.SEED = 0 -# Perform evaluation only, overwritten by command line argument -_C.EVAL_MODE = False -# Test throughput only, overwritten by command line argument -_C.THROUGHPUT_MODE = False -# local rank for DistributedDataParallel, given by command line argument -_C.LOCAL_RANK = 0 - - -def _update_config_from_file(config, cfg_file): - config.defrost() - with open(cfg_file, 'r') as f: - yaml_cfg = yaml.load(f, Loader=yaml.FullLoader) - - for cfg in yaml_cfg.setdefault('BASE', ['']): - if cfg: - _update_config_from_file( - config, os.path.join(os.path.dirname(cfg_file), cfg) - ) - print('=> merge config from {}'.format(cfg_file)) - config.merge_from_file(cfg_file) - config.freeze() - - -def update_config(config, yaml_file): - _update_config_from_file(config, yaml_file) - - config.defrost() - # if args.opts: - # config.merge_from_list(args.opts) - - # # merge from specific arguments - # if args.batch_size: - # config.DATA.BATCH_SIZE = args.batch_size - # if args.data_path: - # config.DATA.DATA_PATH = args.data_path - # if args.zip: - # config.DATA.ZIP_MODE = True - # if args.cache_mode: - # config.DATA.CACHE_MODE = args.cache_mode - # if args.resume: - # config.MODEL.RESUME = args.resume - # if args.accumulation_steps: - # config.TRAIN.ACCUMULATION_STEPS = args.accumulation_steps - # if args.use_checkpoint: - # config.TRAIN.USE_CHECKPOINT = True - # if args.amp_opt_level: - # config.AMP_OPT_LEVEL = args.amp_opt_level - # if args.output: - # config.OUTPUT = args.output - # if args.tag: - # config.TAG = args.tag - # if args.eval: - # config.EVAL_MODE = True - # if args.throughput: - # config.THROUGHPUT_MODE = True - - # # set local rank for distributed training - # config.LOCAL_RANK = args.local_rank - - # # output folder - # config.OUTPUT = os.path.join(config.OUTPUT, config.MODEL.NAME, config.TAG) - - config.freeze() - - -def get_config(yaml_file): - """Get a yacs CfgNode object with default values.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern - config = _C.clone() - update_config(config, yaml_file) - - return config diff --git a/AVLFormer/src/modeling/swin/swin_base_patch4_window12_384.yaml b/AVLFormer/src/modeling/swin/swin_base_patch4_window12_384.yaml deleted file mode 100644 index b54deb7..0000000 --- a/AVLFormer/src/modeling/swin/swin_base_patch4_window12_384.yaml +++ /dev/null @@ -1,13 +0,0 @@ -# only for evaluation -DATA: - IMG_SIZE: 384 -MODEL: - TYPE: swin - NAME: swin_base_patch4_window12_384 - SWIN: - EMBED_DIM: 128 - DEPTHS: [ 2, 2, 18, 2 ] - NUM_HEADS: [ 4, 8, 16, 32 ] - WINDOW_SIZE: 12 -TEST: - CROP: False \ No newline 
at end of file diff --git a/AVLFormer/src/modeling/swin/swin_base_patch4_window7_224.yaml b/AVLFormer/src/modeling/swin/swin_base_patch4_window7_224.yaml deleted file mode 100644 index 422a898..0000000 --- a/AVLFormer/src/modeling/swin/swin_base_patch4_window7_224.yaml +++ /dev/null @@ -1,10 +0,0 @@ -MODEL: - TYPE: swin - NAME: swin_base_patch4_window7_224 - DROP_PATH_RATE: 0.5 - SWIN: - EMBED_DIM: 128 - DEPTHS: [ 2, 2, 18, 2 ] - NUM_HEADS: [ 4, 8, 16, 32 ] - WINDOW_SIZE: 7 - NUM_CLASSES: 1000 \ No newline at end of file diff --git a/AVLFormer/src/modeling/swin/swin_base_patch4_window7_224_22k.yaml b/AVLFormer/src/modeling/swin/swin_base_patch4_window7_224_22k.yaml deleted file mode 100644 index c3ab60c..0000000 --- a/AVLFormer/src/modeling/swin/swin_base_patch4_window7_224_22k.yaml +++ /dev/null @@ -1,10 +0,0 @@ -MODEL: - TYPE: swin - NAME: swin_base_patch4_window7_224 - DROP_PATH_RATE: 0.5 - SWIN: - EMBED_DIM: 128 - DEPTHS: [ 2, 2, 18, 2 ] - NUM_HEADS: [ 4, 8, 16, 32 ] - WINDOW_SIZE: 7 - NUM_CLASSES: 21841 \ No newline at end of file diff --git a/AVLFormer/src/modeling/swin/swin_large_patch4_window12_384.yaml b/AVLFormer/src/modeling/swin/swin_large_patch4_window12_384.yaml deleted file mode 100644 index bacf5f6..0000000 --- a/AVLFormer/src/modeling/swin/swin_large_patch4_window12_384.yaml +++ /dev/null @@ -1,13 +0,0 @@ -# only for evaluation -DATA: - IMG_SIZE: 384 -MODEL: - TYPE: swin - NAME: swin_large_patch4_window12_384 - SWIN: - EMBED_DIM: 192 - DEPTHS: [ 2, 2, 18, 2 ] - NUM_HEADS: [ 6, 12, 24, 48 ] - WINDOW_SIZE: 12 -TEST: - CROP: False \ No newline at end of file diff --git a/AVLFormer/src/modeling/swin/swin_large_patch4_window7_224.yaml b/AVLFormer/src/modeling/swin/swin_large_patch4_window7_224.yaml deleted file mode 100644 index df8af4c..0000000 --- a/AVLFormer/src/modeling/swin/swin_large_patch4_window7_224.yaml +++ /dev/null @@ -1,9 +0,0 @@ -# only for evaluation -MODEL: - TYPE: swin - NAME: swin_large_patch4_window7_224 - SWIN: - EMBED_DIM: 192 - DEPTHS: [ 2, 2, 18, 2 ] - NUM_HEADS: [ 6, 12, 24, 48 ] - WINDOW_SIZE: 7 \ No newline at end of file diff --git a/AVLFormer/src/modeling/swin/swin_small_patch4_window7_224.yaml b/AVLFormer/src/modeling/swin/swin_small_patch4_window7_224.yaml deleted file mode 100644 index 8f5c40f..0000000 --- a/AVLFormer/src/modeling/swin/swin_small_patch4_window7_224.yaml +++ /dev/null @@ -1,9 +0,0 @@ -MODEL: - TYPE: swin - NAME: swin_small_patch4_window7_224 - DROP_PATH_RATE: 0.3 - SWIN: - EMBED_DIM: 96 - DEPTHS: [ 2, 2, 18, 2 ] - NUM_HEADS: [ 3, 6, 12, 24 ] - WINDOW_SIZE: 7 \ No newline at end of file diff --git a/AVLFormer/src/modeling/swin/swin_tiny_patch4_window7_224.yaml b/AVLFormer/src/modeling/swin/swin_tiny_patch4_window7_224.yaml deleted file mode 100644 index 851c745..0000000 --- a/AVLFormer/src/modeling/swin/swin_tiny_patch4_window7_224.yaml +++ /dev/null @@ -1,9 +0,0 @@ -MODEL: - TYPE: swin - NAME: swin_tiny_patch4_window7_224 - DROP_PATH_RATE: 0.2 - SWIN: - EMBED_DIM: 96 - DEPTHS: [ 2, 2, 6, 2 ] - NUM_HEADS: [ 3, 6, 12, 24 ] - WINDOW_SIZE: 7 \ No newline at end of file diff --git a/AVLFormer/src/modeling/swin/swin_transformer.py b/AVLFormer/src/modeling/swin/swin_transformer.py deleted file mode 100644 index 9b82c17..0000000 --- a/AVLFormer/src/modeling/swin/swin_transformer.py +++ /dev/null @@ -1,613 +0,0 @@ -# -------------------------------------------------------- -# Swin Transformer -# Copyright (c) 2021 Microsoft -# Licensed under The MIT License [see LICENSE for details] -# Written by Ze Liu -# 
-------------------------------------------------------- - -import torch -import torch.nn as nn -import torch.utils.checkpoint as checkpoint -from src.timm.models.layers import DropPath, to_2tuple, trunc_normal_ - - -class Mlp(nn.Module): - def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -def window_partition(x, window_size): - """ - Args: - x: (B, H, W, C) - window_size (int): window size - - Returns: - windows: (num_windows*B, window_size, window_size, C) - """ - B, H, W, C = x.shape - x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) - windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) - return windows - - -def window_reverse(windows, window_size, H, W): - """ - Args: - windows: (num_windows*B, window_size, window_size, C) - window_size (int): Window size - H (int): Height of image - W (int): Width of image - - Returns: - x: (B, H, W, C) - """ - B = int(windows.shape[0] / (H * W / window_size / window_size)) - x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) - x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) - return x - - -class WindowAttention(nn.Module): - r""" Window based multi-head self attention (W-MSA) module with relative position bias. - It supports both shifted and non-shifted windows. - - Args: - dim (int): Number of input channels. - window_size (tuple[int]): The height and width of the window. - num_heads (int): Number of attention heads. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set - attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 - proj_drop (float, optional): Dropout ratio of output.
Default: 0.0 - """ - - def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.): - - super().__init__() - self.dim = dim - self.window_size = window_size # Wh, Ww - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim ** -0.5 - - # define a parameter table of relative position bias - self.relative_position_bias_table = nn.Parameter( - torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH - - # get pair-wise relative position index for each token inside the window - coords_h = torch.arange(self.window_size[0]) - coords_w = torch.arange(self.window_size[1]) - coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww - relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += self.window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 - relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww - self.register_buffer("relative_position_index", relative_position_index) - - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - trunc_normal_(self.relative_position_bias_table, std=.02) - self.softmax = nn.Softmax(dim=-1) - - def forward(self, x, mask=None): - """ - Args: - x: input features with shape of (num_windows*B, N, C) - mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None - """ - B_, N, C = x.shape - qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) - q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) - - q = q * self.scale - attn = (q @ k.transpose(-2, -1)) - - relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( - self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww - attn = attn + relative_position_bias.unsqueeze(0) - - if mask is not None: - nW = mask.shape[0] - attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) - attn = attn.view(-1, self.num_heads, N, N) - attn = self.softmax(attn) - else: - attn = self.softmax(attn) - - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B_, N, C) - x = self.proj(x) - x = self.proj_drop(x) - return x - - def extra_repr(self) -> str: - return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}' - - def flops(self, N): - # calculate flops for 1 window with token length of N - flops = 0 - # qkv = self.qkv(x) - flops += N * self.dim * 3 * self.dim - # attn = (q @ k.transpose(-2, -1)) - flops += self.num_heads * N * (self.dim // self.num_heads) * N - # x = (attn @ v) - flops += self.num_heads * N * N * (self.dim // self.num_heads) - # x = self.proj(x) - flops += N * self.dim * self.dim - return flops - - -class SwinTransformerBlock(nn.Module): - r""" Swin Transformer Block. - - Args: - dim (int): Number of input channels. - input_resolution (tuple[int]): Input resolution. - num_heads (int): Number of attention heads.
- window_size (int): Window size. - shift_size (int): Shift size for SW-MSA. - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. - drop (float, optional): Dropout rate. Default: 0.0 - attn_drop (float, optional): Attention dropout rate. Default: 0.0 - drop_path (float, optional): Stochastic depth rate. Default: 0.0 - act_layer (nn.Module, optional): Activation layer. Default: nn.GELU - norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm - """ - - def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0, - mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., - act_layer=nn.GELU, norm_layer=nn.LayerNorm): - super().__init__() - self.dim = dim - self.input_resolution = input_resolution - self.num_heads = num_heads - self.window_size = window_size - self.shift_size = shift_size - self.mlp_ratio = mlp_ratio - if min(self.input_resolution) <= self.window_size: - # if window size is larger than input resolution, we don't partition windows - self.shift_size = 0 - self.window_size = min(self.input_resolution) - assert 0 <= self.shift_size < self.window_size, "shift_size must be in [0, window_size)" - - self.norm1 = norm_layer(dim) - self.attn = WindowAttention( - dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, - qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) - - self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) - - if self.shift_size > 0: - # calculate attention mask for SW-MSA - H, W = self.input_resolution - img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 - h_slices = (slice(0, -self.window_size), - slice(-self.window_size, -self.shift_size), - slice(-self.shift_size, None)) - w_slices = (slice(0, -self.window_size), - slice(-self.window_size, -self.shift_size), - slice(-self.shift_size, None)) - cnt = 0 - for h in h_slices: - for w in w_slices: - img_mask[:, h, w, :] = cnt - cnt += 1 - - mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 - mask_windows = mask_windows.view(-1, self.window_size * self.window_size) - attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) - attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) - else: - attn_mask = None - - self.register_buffer("attn_mask", attn_mask) - - def forward(self, x): - H, W = self.input_resolution - B, L, C = x.shape - assert L == H * W, "input feature has wrong size" - - shortcut = x - x = self.norm1(x) - x = x.view(B, H, W, C) - - # cyclic shift - if self.shift_size > 0: - shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) - else: - shifted_x = x - - # partition windows - x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C - x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C - - # W-MSA/SW-MSA - attn_windows = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C - - # merge windows - attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) - shifted_x =
window_reverse(attn_windows, self.window_size, H, W) # B H' W' C - - # reverse cyclic shift - if self.shift_size > 0: - x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) - else: - x = shifted_x - x = x.view(B, H * W, C) - - # FFN - x = shortcut + self.drop_path(x) - x = x + self.drop_path(self.mlp(self.norm2(x))) - - return x - - def extra_repr(self) -> str: - return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \ - f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}" - - def flops(self): - flops = 0 - H, W = self.input_resolution - # norm1 - flops += self.dim * H * W - # W-MSA/SW-MSA - nW = H * W / self.window_size / self.window_size - flops += nW * self.attn.flops(self.window_size * self.window_size) - # mlp - flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio - # norm2 - flops += self.dim * H * W - return flops - - -class PatchMerging(nn.Module): - r""" Patch Merging Layer. - - Args: - input_resolution (tuple[int]): Resolution of input feature. - dim (int): Number of input channels. - norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm - """ - - def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): - super().__init__() - self.input_resolution = input_resolution - self.dim = dim - self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) - self.norm = norm_layer(4 * dim) - - def forward(self, x): - """ - x: B, H*W, C - """ - H, W = self.input_resolution - B, L, C = x.shape - assert L == H * W, "input feature has wrong size" - assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) is not even." - - x = x.view(B, H, W, C) - - x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C - x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C - x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C - x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C - x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C - x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C - - x = self.norm(x) - x = self.reduction(x) - - return x - - def extra_repr(self) -> str: - return f"input_resolution={self.input_resolution}, dim={self.dim}" - - def flops(self): - H, W = self.input_resolution - flops = H * W * self.dim - flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim - return flops - - -class BasicLayer(nn.Module): - """ A basic Swin Transformer layer for one stage. - - Args: - dim (int): Number of input channels. - input_resolution (tuple[int]): Input resolution. - depth (int): Number of blocks. - num_heads (int): Number of attention heads. - window_size (int): Local window size. - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. - drop (float, optional): Dropout rate. Default: 0.0 - attn_drop (float, optional): Attention dropout rate. Default: 0.0 - drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 - norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm - downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None - use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
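Example (illustrative sketch with Swin-T stage-1 shapes; the numbers
here are assumptions, not values taken from this file):
    >>> layer = BasicLayer(dim=96, input_resolution=(56, 56), depth=2,
    ...                    num_heads=3, window_size=7)
    >>> x = torch.randn(2, 56 * 56, 96)
    >>> layer(x).shape  # no downsample layer, so resolution is kept
    torch.Size([2, 3136, 96])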
- """ - - def __init__(self, dim, input_resolution, depth, num_heads, window_size, - mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., - drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False): - - super().__init__() - self.dim = dim - self.input_resolution = input_resolution - self.depth = depth - self.use_checkpoint = use_checkpoint - - # build blocks - self.blocks = nn.ModuleList([ - SwinTransformerBlock(dim=dim, input_resolution=input_resolution, - num_heads=num_heads, window_size=window_size, - shift_size=0 if (i % 2 == 0) else window_size // 2, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop, attn_drop=attn_drop, - drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, - norm_layer=norm_layer) - for i in range(depth)]) - - # patch merging layer - if downsample is not None: - self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer) - else: - self.downsample = None - - def forward(self, x): - for blk in self.blocks: - if self.use_checkpoint: - x = checkpoint.checkpoint(blk, x) - else: - x = blk(x) - if self.downsample is not None: - x = self.downsample(x) - return x - - def extra_repr(self) -> str: - return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}" - - def flops(self): - flops = 0 - for blk in self.blocks: - flops += blk.flops() - if self.downsample is not None: - flops += self.downsample.flops() - return flops - - -class PatchEmbed(nn.Module): - r""" Image to Patch Embedding - - Args: - img_size (int): Image size. Default: 224. - patch_size (int): Patch token size. Default: 4. - in_chans (int): Number of input image channels. Default: 3. - embed_dim (int): Number of linear projection output channels. Default: 96. - norm_layer (nn.Module, optional): Normalization layer. Default: None - """ - - def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): - super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]] - self.img_size = img_size - self.patch_size = patch_size - self.patches_resolution = patches_resolution - self.num_patches = patches_resolution[0] * patches_resolution[1] - - self.in_chans = in_chans - self.embed_dim = embed_dim - - self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) - if norm_layer is not None: - self.norm = norm_layer(embed_dim) - else: - self.norm = None - - def forward(self, x): - B, C, H, W = x.shape - # FIXME look at relaxing size constraints - assert H == self.img_size[0] and W == self.img_size[1], \ - f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." - x = self.proj(x).flatten(2).transpose(1, 2) # B Ph*Pw C - if self.norm is not None: - x = self.norm(x) - return x - - def flops(self): - Ho, Wo = self.patches_resolution - flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) - if self.norm is not None: - flops += Ho * Wo * self.embed_dim - return flops - - -class SwinTransformer(nn.Module): - r""" Swin Transformer - A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - - https://arxiv.org/pdf/2103.14030 - - Args: - img_size (int | tuple(int)): Input image size. Default 224 - patch_size (int | tuple(int)): Patch size. Default: 4 - in_chans (int): Number of input image channels. 
Default: 3 - num_classes (int): Number of classes for classification head. Default: 1000 - embed_dim (int): Patch embedding dimension. Default: 96 - depths (tuple(int)): Depth of each Swin Transformer layer. - num_heads (tuple(int)): Number of attention heads in different layers. - window_size (int): Window size. Default: 7 - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 - qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None - drop_rate (float): Dropout rate. Default: 0 - attn_drop_rate (float): Attention dropout rate. Default: 0 - drop_path_rate (float): Stochastic depth rate. Default: 0.1 - norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. - ape (bool): If True, add absolute position embedding to the patch embedding. Default: False - patch_norm (bool): If True, add normalization after patch embedding. Default: True - use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False - """ - - def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, - embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], - window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, - drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, - norm_layer=nn.LayerNorm, ape=False, patch_norm=True, - use_checkpoint=False, **kwargs): - super().__init__() - - self.num_classes = num_classes - self.num_layers = len(depths) - self.embed_dim = embed_dim - self.ape = ape - self.patch_norm = patch_norm - self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) - self.mlp_ratio = mlp_ratio - - # split image into non-overlapping patches - self.patch_embed = PatchEmbed( - img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, - norm_layer=norm_layer if self.patch_norm else None) - num_patches = self.patch_embed.num_patches - patches_resolution = self.patch_embed.patches_resolution - self.patches_resolution = patches_resolution - - # absolute position embedding - if self.ape: - self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) - trunc_normal_(self.absolute_pos_embed, std=.02) - - self.pos_drop = nn.Dropout(p=drop_rate) - - # stochastic depth - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule - - # build layers - self.layers = nn.ModuleList() - for i_layer in range(self.num_layers): - layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer), - input_resolution=(patches_resolution[0] // (2 ** i_layer), - patches_resolution[1] // (2 ** i_layer)), - depth=depths[i_layer], - num_heads=num_heads[i_layer], - window_size=window_size, - mlp_ratio=self.mlp_ratio, - qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop_rate, attn_drop=attn_drop_rate, - drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], - norm_layer=norm_layer, - downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, - use_checkpoint=use_checkpoint) - self.layers.append(layer) - - self.norm = norm_layer(self.num_features) - self.avgpool = nn.AdaptiveAvgPool1d(1) - self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() - - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - 
nn.init.constant_(m.weight, 1.0) - - @torch.jit.ignore - def no_weight_decay(self): - return {'absolute_pos_embed'} - - @torch.jit.ignore - def no_weight_decay_keywords(self): - return {'relative_position_bias_table'} - - def forward_features(self, x): - x = self.patch_embed(x) - if self.ape: - x = x + self.absolute_pos_embed - x = self.pos_drop(x) - - for layer in self.layers: - x = layer(x) - - x = self.norm(x) # B L C - x = self.avgpool(x.transpose(1, 2)) # B C 1 - x = torch.flatten(x, 1) - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.head(x) - return x - - def forward_global_gridfeat(self, x): - x = self.patch_embed(x) - if self.ape: - x = x + self.absolute_pos_embed - x = self.pos_drop(x) - - for layer in self.layers: - x = layer(x) - - x = self.norm(x) # B L C - xx = self.avgpool(x.transpose(1, 2)) # B C 1 - xx = torch.flatten(xx, 1) - return xx, x - - def forward_pyramid_feat(self, x): - """Forward function.""" - x = self.patch_embed(x) - if self.ape: - x = x + self.absolute_pos_embed - x = self.pos_drop(x) - - outs = [] - for layer in self.layers: - x = layer(x) - outs.append(x) - - return tuple(outs) - - def flops(self): - flops = 0 - flops += self.patch_embed.flops() - for i, layer in enumerate(self.layers): - flops += layer.flops() - flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers) - flops += self.num_features * self.num_classes - return flops diff --git a/AVLFormer/src/modeling/video_captioning_e2e_vid_swin_bert.py b/AVLFormer/src/modeling/video_captioning_e2e_vid_swin_bert.py deleted file mode 100644 index 3ebf5b5..0000000 --- a/AVLFormer/src/modeling/video_captioning_e2e_vid_swin_bert.py +++ /dev/null @@ -1,183 +0,0 @@ -from fairscale.nn.misc import checkpoint_wrapper -import torch - - -class SimpleRMSNorm(torch.nn.Module): - - def __init__(self, d, p=-1., eps=1e-6, bias=False): - """ - Root Mean Square Layer Normalization - :param d: model size - :param p: partial RMSNorm, valid value [0, 1], default -1.0 (disabled) - :param eps: epsilon value, default 1e-6 - :param bias: whether to use a bias term for RMSNorm, disabled by - default because RMSNorm doesn't enforce re-centering invariance. - """ - super(SimpleRMSNorm, self).__init__() - self.eps = eps - self.d = d - - def forward(self, x): - norm_x = x.norm(2, dim=-1, keepdim=True) - d_x = self.d - - rms_x = norm_x * d_x**(-1.
/ 2) - x_normed = x / (rms_x + self.eps) - - return x_normed - - -class VideoTransformer(torch.nn.Module): - - def __init__(self, args, config, swin, transformer_encoder, passt): - super(VideoTransformer, self).__init__() - self.config = config - self.use_checkpoint = args.use_checkpoint and not args.freeze_backbone - if self.use_checkpoint: - self.swin = checkpoint_wrapper(swin, offload_to_cpu=True) - else: - self.swin = swin - self.trans_encoder = transformer_encoder - self.img_feature_dim = int(args.img_feature_dim) - self.use_grid_feat = args.grid_feat - self.latent_feat_size = self.swin.backbone.norm.normalized_shape[0] - self.fc = torch.nn.Linear(self.latent_feat_size, self.img_feature_dim) - self.compute_mask_on_the_fly = False # deprecated - self.mask_prob = args.mask_prob - self.mask_token_id = -1 - self.max_img_seq_length = args.max_img_seq_length - # learn soft attention mask - self.learn_mask_enabled = getattr(args, 'learn_mask_enabled', False) - self.sparse_mask_soft2hard = getattr(args, 'sparse_mask_soft2hard', - False) - - # passt - self.passt = passt - self.norm = SimpleRMSNorm(d=self.img_feature_dim) - - if self.learn_mask_enabled: - self.learn_vid_att = torch.nn.Embedding( - args.max_img_seq_length * args.max_img_seq_length, 1) - self.sigmoid = torch.nn.Sigmoid() - - def forward(self, *args, **kwargs): - # input_ids = torch.Size([3, 300]) - # attention_mask = torch.Size([3, 1084, 1084]) - # token_type_ids = torch.Size([3, 300]) - # img_feats = torch.Size([3, 32, 3, 224, 224]) - # masked_pos = torch.Size([3, 300]) - # masked_ids = torch.Size([3, 45]) - - audios = kwargs['audio_feat'] - audios = self.norm(self.passt(audios)) - del kwargs['audio_feat'] - - images = kwargs['img_feats'] - B, S, C, H, W = images.shape # batch, segment, channel, height, width - # (B x S x C x H x W) --> (B x C x S x H x W) - images = images.permute(0, 2, 1, 3, 4) - vid_feats = self.swin(images) - if self.use_grid_feat: - vid_feats = vid_feats.permute(0, 2, 3, 4, 1) - vid_feats = vid_feats.view(B, -1, self.latent_feat_size) - vid_feats = self.fc(vid_feats) - - # concat vid + audio - vid_feats = torch.cat((vid_feats, audios), - dim=-2) # vid_feats = torch.Size([3, 784, 512]) - # prepare VL transformer inputs - kwargs['img_feats'] = vid_feats - - # self.trans_encoder.bert.encoder.output_attention = False - # if self.trans_encoder.bert.encoder.output_attentions: - # self.trans_encoder.bert.encoder.set_output_attentions(False) - - # learn soft attention mask - if self.learn_mask_enabled: - kwargs['attention_mask'] = kwargs['attention_mask'].float() - # max_img_seq_length = 784 - vid_att_len = self.max_img_seq_length - learn_att = self.learn_vid_att.weight.reshape( - vid_att_len, vid_att_len) - learn_att = self.sigmoid(learn_att) - diag_mask = torch.diag(torch.ones(vid_att_len)).cuda() - video_attention = (1.
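# NOTE: the diagonal is zeroed here and added back on the next line,
# so every position always attends to itself with weight 1.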
- diag_mask) * learn_att - learn_att = diag_mask + video_attention - # sparse_mask_soft2hard = False - if self.sparse_mask_soft2hard: - learn_att = (learn_att >= 0.5) * 1.0 - learn_att = learn_att.cuda() - learn_att.requires_grad = False - kwargs['attention_mask'][:, -vid_att_len::, - -vid_att_len::] = learn_att - - outputs = self.trans_encoder(*args, **kwargs) - if self.learn_mask_enabled: - loss_sparsity = self.get_loss_sparsity(video_attention) - outputs = outputs + (loss_sparsity, ) - - return outputs # outputs = (loss= NUM, logits= torch.Size([123, 30522])) - - def get_loss_sparsity(self, video_attention): - sparsity_loss = 0 - sparsity_loss += (torch.mean(torch.abs(video_attention))) - return sparsity_loss - - def diag_based_init_attn_mask(self, pretrain_attn_mask): - import numpy - pretrained_num_tokens = int(numpy.sqrt(pretrain_attn_mask.shape[0])) - - pretrained_learn_att = pretrain_attn_mask.reshape( - pretrained_num_tokens, pretrained_num_tokens) - zeros_mask = torch.zeros_like(pretrained_learn_att) - scale_factor = self.max_img_seq_length / pretrained_num_tokens - - vid_att_len = self.max_img_seq_length - learn_att = self.learn_vid_att.weight.reshape(vid_att_len, vid_att_len) - with torch.no_grad(): - for i in range(int(scale_factor)): - learn_att[pretrained_num_tokens * i:pretrained_num_tokens * - (i + 1), - pretrained_num_tokens * i:pretrained_num_tokens * - (i + 1)] = pretrained_learn_att - - def bilinear_init_attn_mask(self, pretrain_attn_mask): - print('init attn mask with bilinear interpolation') - import numpy - pretrained_num_tokens = int(numpy.sqrt(pretrain_attn_mask.shape[0])) - - pretrained_learn_att = pretrain_attn_mask.reshape( - pretrained_num_tokens, pretrained_num_tokens) - vid_att_len = self.max_img_seq_length - learn_att = self.learn_vid_att.weight.reshape(vid_att_len, vid_att_len) - scale_factor = int(self.max_img_seq_length / pretrained_num_tokens) - sampler = torch.nn.Upsample(scale_factor=scale_factor, mode='bilinear') - with torch.no_grad(): - learn_att = sampler( - pretrained_learn_att[None, - None, :, :].double())[0, 0, :, :].half() - - def random_init_attn_mask(self): - print('random init attn mask') - self.learn_vid_att = torch.nn.Embedding( - self.max_img_seq_length * self.max_img_seq_length, 1) - - def reload_attn_mask(self, pretrain_attn_mask): - import numpy - pretrained_num_tokens = int(numpy.sqrt(pretrain_attn_mask.shape[0])) - - pretrained_learn_att = pretrain_attn_mask.reshape( - pretrained_num_tokens, pretrained_num_tokens) - scale_factor = 1 - vid_att_len = self.max_img_seq_length - learn_att = self.learn_vid_att.weight.reshape(vid_att_len, vid_att_len) - with torch.no_grad(): - for i in range(int(scale_factor)): - learn_att[pretrained_num_tokens * i:pretrained_num_tokens * - (i + 1), - pretrained_num_tokens * i:pretrained_num_tokens * - (i + 1)] = pretrained_learn_att - - def freeze_backbone(self, freeze=True): - for _, p in self.swin.named_parameters(): - p.requires_grad = not freeze diff --git a/AVLFormer/src/modeling/video_swin/config.py b/AVLFormer/src/modeling/video_swin/config.py deleted file mode 100644 index 1226560..0000000 --- a/AVLFormer/src/modeling/video_swin/config.py +++ /dev/null @@ -1,721 +0,0 @@ -# Copyright (c) Open-MMLab. All rights reserved. 
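# NOTE: this module appears to vendor a trimmed copy of mmcv's config
# utilities (check_file_exist, ConfigDict, add_args, Config) so python
# configs can be parsed without a full mmcv install; the yaml/json
# branch further below still imports mmcv.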
-import ast -import copy -import os -import os.path as osp -import platform -import shutil -import sys -import tempfile -import uuid -import warnings -from argparse import Action, ArgumentParser -from collections import abc -from importlib import import_module - -from addict import Dict -from yapf.yapflib.yapf_api import FormatCode - -# from .misc import import_modules_from_strings -# from .path import check_file_exist - -if platform.system() == 'Windows': - import regex as re -else: - import re - -BASE_KEY = '_base_' -DELETE_KEY = '_delete_' -RESERVED_KEYS = ['filename', 'text', 'pretty_text'] - - - -def check_file_exist(filename, msg_tmpl='file "{}" does not exist'): - if not osp.isfile(filename): - raise FileNotFoundError(msg_tmpl.format(filename)) - -def import_modules_from_strings(imports, allow_failed_imports=False): - """Import modules from the given list of strings. - Args: - imports (list | str | None): The given module names to be imported. - allow_failed_imports (bool): If True, the failed imports will return - None. Otherwise, an ImportError is raised. Default: False. - Returns: - list[module] | module | None: The imported modules. - Examples: - >>> osp, sys = import_modules_from_strings( - ... ['os.path', 'sys']) - >>> import os.path as osp_ - >>> import sys as sys_ - >>> assert osp == osp_ - >>> assert sys == sys_ - """ - if not imports: - return - single_import = False - if isinstance(imports, str): - single_import = True - imports = [imports] - if not isinstance(imports, list): - raise TypeError( - f'custom_imports must be a list but got type {type(imports)}') - imported = [] - for imp in imports: - if not isinstance(imp, str): - raise TypeError( - f'{imp} is of type {type(imp)} and cannot be imported.') - try: - imported_tmp = import_module(imp) - except ImportError: - if allow_failed_imports: - warnings.warn(f'{imp} failed to import and is ignored.', - UserWarning) - imported_tmp = None - else: - raise ImportError - imported.append(imported_tmp) - if single_import: - imported = imported[0] - return imported - - -class ConfigDict(Dict): - - def __missing__(self, name): - raise KeyError(name) - - def __getattr__(self, name): - try: - value = super(ConfigDict, self).__getattr__(name) - except KeyError: - ex = AttributeError(f"'{self.__class__.__name__}' object has no " - f"attribute '{name}'") - except Exception as e: - ex = e - else: - return value - raise ex - - -def add_args(parser, cfg, prefix=''): - for k, v in cfg.items(): - if isinstance(v, str): - parser.add_argument('--' + prefix + k) - elif isinstance(v, int): - parser.add_argument('--' + prefix + k, type=int) - elif isinstance(v, float): - parser.add_argument('--' + prefix + k, type=float) - elif isinstance(v, bool): - parser.add_argument('--' + prefix + k, action='store_true') - elif isinstance(v, dict): - add_args(parser, v, prefix + k + '.') - elif isinstance(v, abc.Iterable): - parser.add_argument('--' + prefix + k, type=type(v[0]), nargs='+') - else: - print(f'cannot parse key {prefix + k} of type {type(v)}') - return parser - - -class Config: - """A facility for config and config files. - - It supports common file formats as configs: python/json/yaml. The interface - is the same as a dict object and also allows accessing config values as - attributes.
- - Example: - >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1]))) - >>> cfg.a - 1 - >>> cfg.b - {'b1': [0, 1]} - >>> cfg.b.b1 - [0, 1] - >>> cfg = Config.fromfile('tests/data/config/a.py') - >>> cfg.filename - "/home/kchen/projects/mmcv/tests/data/config/a.py" - >>> cfg.item4 - 'test' - >>> cfg - "Config [path: /home/kchen/projects/mmcv/tests/data/config/a.py]: " - "{'item1': [1, 2], 'item2': {'a': 0}, 'item3': True, 'item4': 'test'}" - """ - - @staticmethod - def _validate_py_syntax(filename): - with open(filename, 'r', encoding='utf-8') as f: - # Setting encoding explicitly to resolve coding issue on windows - content = f.read() - try: - ast.parse(content) - except SyntaxError as e: - raise SyntaxError('There are syntax errors in config ' - f'file {filename}: {e}') - - @staticmethod - def _substitute_predefined_vars(filename, temp_config_name): - file_dirname = osp.dirname(filename) - file_basename = osp.basename(filename) - file_basename_no_extension = osp.splitext(file_basename)[0] - file_extname = osp.splitext(filename)[1] - support_templates = dict( - fileDirname=file_dirname, - fileBasename=file_basename, - fileBasenameNoExtension=file_basename_no_extension, - fileExtname=file_extname) - with open(filename, 'r', encoding='utf-8') as f: - # Setting encoding explicitly to resolve coding issue on windows - config_file = f.read() - for key, value in support_templates.items(): - regexp = r'\{\{\s*' + str(key) + r'\s*\}\}' - value = value.replace('\\', '/') - config_file = re.sub(regexp, value, config_file) - with open(temp_config_name, 'w') as tmp_config_file: - tmp_config_file.write(config_file) - - @staticmethod - def _pre_substitute_base_vars(filename, temp_config_name): - """Substitute base variable placeholders to string, so that parsing - would work.""" - with open(filename, 'r', encoding='utf-8') as f: - # Setting encoding explicitly to resolve coding issue on windows - config_file = f.read() - base_var_dict = {} - regexp = r'\{\{\s*' + BASE_KEY + r'\.([\w\.]+)\s*\}\}' - base_vars = set(re.findall(regexp, config_file)) - for base_var in base_vars: - randstr = f'_{base_var}_{uuid.uuid4().hex.lower()[:6]}' - base_var_dict[randstr] = base_var - regexp = r'\{\{\s*' + BASE_KEY + r'\.'
+ base_var + r'\s*\}\}' - config_file = re.sub(regexp, f'"{randstr}"', config_file) - with open(temp_config_name, 'w') as tmp_config_file: - tmp_config_file.write(config_file) - return base_var_dict - - @staticmethod - def _substitute_base_vars(cfg, base_var_dict, base_cfg): - """Substitute variable strings to their actual values.""" - cfg = copy.deepcopy(cfg) - - if isinstance(cfg, dict): - for k, v in cfg.items(): - if isinstance(v, str) and v in base_var_dict: - new_v = base_cfg - for new_k in base_var_dict[v].split('.'): - new_v = new_v[new_k] - cfg[k] = new_v - elif isinstance(v, (list, tuple, dict)): - cfg[k] = Config._substitute_base_vars( - v, base_var_dict, base_cfg) - elif isinstance(cfg, tuple): - cfg = tuple( - Config._substitute_base_vars(c, base_var_dict, base_cfg) - for c in cfg) - elif isinstance(cfg, list): - cfg = [ - Config._substitute_base_vars(c, base_var_dict, base_cfg) - for c in cfg - ] - elif isinstance(cfg, str) and cfg in base_var_dict: - new_v = base_cfg - for new_k in base_var_dict[cfg].split('.'): - new_v = new_v[new_k] - cfg = new_v - - return cfg - - @staticmethod - def _file2dict(filename, use_predefined_variables=True): - filename = osp.abspath(osp.expanduser(filename)) - check_file_exist(filename) - fileExtname = osp.splitext(filename)[1] - if fileExtname not in ['.py', '.json', '.yaml', '.yml']: - raise IOError('Only py/yml/yaml/json type are supported now!') - - with tempfile.TemporaryDirectory() as temp_config_dir: - temp_config_file = tempfile.NamedTemporaryFile( - dir=temp_config_dir, suffix=fileExtname) - if platform.system() == 'Windows': - temp_config_file.close() - temp_config_name = osp.basename(temp_config_file.name) - # Substitute predefined variables - if use_predefined_variables: - Config._substitute_predefined_vars(filename, - temp_config_file.name) - else: - shutil.copyfile(filename, temp_config_file.name) - # Substitute base variables from placeholders to strings - base_var_dict = Config._pre_substitute_base_vars( - temp_config_file.name, temp_config_file.name) - - if filename.endswith('.py'): - temp_module_name = osp.splitext(temp_config_name)[0] - sys.path.insert(0, temp_config_dir) - Config._validate_py_syntax(filename) - mod = import_module(temp_module_name) - sys.path.pop(0) - cfg_dict = { - name: value - for name, value in mod.__dict__.items() - if not name.startswith('__') - } - # delete imported module - del sys.modules[temp_module_name] - elif filename.endswith(('.yml', '.yaml', '.json')): - import mmcv - cfg_dict = mmcv.load(temp_config_file.name) - # close temp file - temp_config_file.close() - - cfg_text = filename + '\n' - with open(filename, 'r', encoding='utf-8') as f: - # Setting encoding explicitly to resolve coding issue on windows - cfg_text += f.read() - - if BASE_KEY in cfg_dict: - cfg_dir = osp.dirname(filename) - base_filename = cfg_dict.pop(BASE_KEY) - base_filename = base_filename if isinstance( - base_filename, list) else [base_filename] - - cfg_dict_list = list() - cfg_text_list = list() - for f in base_filename: - _cfg_dict, _cfg_text = Config._file2dict(osp.join(cfg_dir, f)) - cfg_dict_list.append(_cfg_dict) - cfg_text_list.append(_cfg_text) - - base_cfg_dict = dict() - for c in cfg_dict_list: - if len(base_cfg_dict.keys() & c.keys()) > 0: - raise KeyError('Duplicate key is not allowed among bases') - base_cfg_dict.update(c) - - # Subtitute base variables from strings to their actual values - cfg_dict = Config._substitute_base_vars(cfg_dict, base_var_dict, - base_cfg_dict) - - base_cfg_dict = 
Config._merge_a_into_b(cfg_dict, base_cfg_dict) - cfg_dict = base_cfg_dict - - # merge cfg_text - cfg_text_list.append(cfg_text) - cfg_text = '\n'.join(cfg_text_list) - - return cfg_dict, cfg_text - - @staticmethod - def _merge_a_into_b(a, b, allow_list_keys=False): - """merge dict ``a`` into dict ``b`` (non-inplace). - - Values in ``a`` will overwrite ``b``. ``b`` is copied first to avoid - in-place modifications. - - Args: - a (dict): The source dict to be merged into ``b``. - b (dict): The origin dict to be fetch keys from ``a``. - allow_list_keys (bool): If True, int string keys (e.g. '0', '1') - are allowed in source ``a`` and will replace the element of the - corresponding index in b if b is a list. Default: False. - - Returns: - dict: The modified dict of ``b`` using ``a``. - - Examples: - # Normally merge a into b. - >>> Config._merge_a_into_b( - ... dict(obj=dict(a=2)), dict(obj=dict(a=1))) - {'obj': {'a': 2}} - - # Delete b first and merge a into b. - >>> Config._merge_a_into_b( - ... dict(obj=dict(_delete_=True, a=2)), dict(obj=dict(a=1))) - {'obj': {'a': 2}} - - # b is a list - >>> Config._merge_a_into_b( - ... {'0': dict(a=2)}, [dict(a=1), dict(b=2)], True) - [{'a': 2}, {'b': 2}] - """ - b = b.copy() - for k, v in a.items(): - if allow_list_keys and k.isdigit() and isinstance(b, list): - k = int(k) - if len(b) <= k: - raise KeyError(f'Index {k} exceeds the length of list {b}') - b[k] = Config._merge_a_into_b(v, b[k], allow_list_keys) - elif isinstance(v, - dict) and k in b and not v.pop(DELETE_KEY, False): - allowed_types = (dict, list) if allow_list_keys else dict - if not isinstance(b[k], allowed_types): - raise TypeError( - f'{k}={v} in child config cannot inherit from base ' - f'because {k} is a dict in the child config but is of ' - f'type {type(b[k])} in base config. You may set ' - f'`{DELETE_KEY}=True` to ignore the base config') - b[k] = Config._merge_a_into_b(v, b[k], allow_list_keys) - else: - b[k] = v - return b - - @staticmethod - def fromfile(filename, - use_predefined_variables=True, - import_custom_modules=True): - cfg_dict, cfg_text = Config._file2dict(filename, - use_predefined_variables) - if import_custom_modules and cfg_dict.get('custom_imports', None): - import_modules_from_strings(**cfg_dict['custom_imports']) - return Config(cfg_dict, cfg_text=cfg_text, filename=filename) - - @staticmethod - def fromstring(cfg_str, file_format): - """Generate config from config str. - - Args: - cfg_str (str): Config str. - file_format (str): Config file format corresponding to the - config str. Only py/yml/yaml/json type are supported now! - - Returns: - obj:`Config`: Config obj. 
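        Example (a sketch of the round trip: the string is written to a
        temporary file with the given suffix and re-parsed via fromfile):
            >>> cfg = Config.fromstring('a = 1\nb = dict(c=2)', '.py')
            >>> cfg.a, cfg.b.c
            (1, 2)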
- """ - if file_format not in ['.py', '.json', '.yaml', '.yml']: - raise IOError('Only py/yml/yaml/json type are supported now!') - if file_format != '.py' and 'dict(' in cfg_str: - # check if users specify a wrong suffix for python - warnings.warn( - 'Please check "file_format", the file format may be .py') - with tempfile.NamedTemporaryFile( - 'w', suffix=file_format, delete=False) as temp_file: - temp_file.write(cfg_str) - # on windows, previous implementation cause error - # see PR 1077 for details - cfg = Config.fromfile(temp_file.name) - os.remove(temp_file.name) - return cfg - - @staticmethod - def auto_argparser(description=None): - """Generate argparser from config file automatically (experimental)""" - partial_parser = ArgumentParser(description=description) - partial_parser.add_argument('config', help='config file path') - cfg_file = partial_parser.parse_known_args()[0].config - cfg = Config.fromfile(cfg_file) - parser = ArgumentParser(description=description) - parser.add_argument('config', help='config file path') - add_args(parser, cfg) - return parser, cfg - - def __init__(self, cfg_dict=None, cfg_text=None, filename=None): - if cfg_dict is None: - cfg_dict = dict() - elif not isinstance(cfg_dict, dict): - raise TypeError('cfg_dict must be a dict, but ' - f'got {type(cfg_dict)}') - for key in cfg_dict: - if key in RESERVED_KEYS: - raise KeyError(f'{key} is reserved for config file') - - super(Config, self).__setattr__('_cfg_dict', ConfigDict(cfg_dict)) - super(Config, self).__setattr__('_filename', filename) - if cfg_text: - text = cfg_text - elif filename: - with open(filename, 'r') as f: - text = f.read() - else: - text = '' - super(Config, self).__setattr__('_text', text) - - @property - def filename(self): - return self._filename - - @property - def text(self): - return self._text - - @property - def pretty_text(self): - - indent = 4 - - def _indent(s_, num_spaces): - s = s_.split('\n') - if len(s) == 1: - return s_ - first = s.pop(0) - s = [(num_spaces * ' ') + line for line in s] - s = '\n'.join(s) - s = first + '\n' + s - return s - - def _format_basic_types(k, v, use_mapping=False): - if isinstance(v, str): - v_str = f"'{v}'" - else: - v_str = str(v) - - if use_mapping: - k_str = f"'{k}'" if isinstance(k, str) else str(k) - attr_str = f'{k_str}: {v_str}' - else: - attr_str = f'{str(k)}={v_str}' - attr_str = _indent(attr_str, indent) - - return attr_str - - def _format_list(k, v, use_mapping=False): - # check if all items in the list are dict - if all(isinstance(_, dict) for _ in v): - v_str = '[\n' - v_str += '\n'.join( - f'dict({_indent(_format_dict(v_), indent)}),' - for v_ in v).rstrip(',') - if use_mapping: - k_str = f"'{k}'" if isinstance(k, str) else str(k) - attr_str = f'{k_str}: {v_str}' - else: - attr_str = f'{str(k)}={v_str}' - attr_str = _indent(attr_str, indent) + ']' - else: - attr_str = _format_basic_types(k, v, use_mapping) - return attr_str - - def _contain_invalid_identifier(dict_str): - contain_invalid_identifier = False - for key_name in dict_str: - contain_invalid_identifier |= \ - (not str(key_name).isidentifier()) - return contain_invalid_identifier - - def _format_dict(input_dict, outest_level=False): - r = '' - s = [] - - use_mapping = _contain_invalid_identifier(input_dict) - if use_mapping: - r += '{' - for idx, (k, v) in enumerate(input_dict.items()): - is_last = idx >= len(input_dict) - 1 - end = '' if outest_level or is_last else ',' - if isinstance(v, dict): - v_str = '\n' + _format_dict(v) - if use_mapping: - k_str = f"'{k}'" if 
isinstance(k, str) else str(k) - attr_str = f'{k_str}: dict({v_str}' - else: - attr_str = f'{str(k)}=dict({v_str}' - attr_str = _indent(attr_str, indent) + ')' + end - elif isinstance(v, list): - attr_str = _format_list(k, v, use_mapping) + end - else: - attr_str = _format_basic_types(k, v, use_mapping) + end - - s.append(attr_str) - r += '\n'.join(s) - if use_mapping: - r += '}' - return r - - cfg_dict = self._cfg_dict.to_dict() - text = _format_dict(cfg_dict, outest_level=True) - # copied from setup.cfg - yapf_style = dict( - based_on_style='pep8', - blank_line_before_nested_class_or_def=True, - split_before_expression_after_opening_paren=True) - text, _ = FormatCode(text, style_config=yapf_style, verify=True) - - return text - - def __repr__(self): - return f'Config (path: {self.filename}): {self._cfg_dict.__repr__()}' - - def __len__(self): - return len(self._cfg_dict) - - def __getattr__(self, name): - return getattr(self._cfg_dict, name) - - def __getitem__(self, name): - return self._cfg_dict.__getitem__(name) - - def __setattr__(self, name, value): - if isinstance(value, dict): - value = ConfigDict(value) - self._cfg_dict.__setattr__(name, value) - - def __setitem__(self, name, value): - if isinstance(value, dict): - value = ConfigDict(value) - self._cfg_dict.__setitem__(name, value) - - def __iter__(self): - return iter(self._cfg_dict) - - def __getstate__(self): - return (self._cfg_dict, self._filename, self._text) - - def __setstate__(self, state): - _cfg_dict, _filename, _text = state - super(Config, self).__setattr__('_cfg_dict', _cfg_dict) - super(Config, self).__setattr__('_filename', _filename) - super(Config, self).__setattr__('_text', _text) - - def dump(self, file=None): - cfg_dict = super(Config, self).__getattribute__('_cfg_dict').to_dict() - if self.filename.endswith('.py'): - if file is None: - return self.pretty_text - else: - with open(file, 'w') as f: - f.write(self.pretty_text) - else: - import mmcv - if file is None: - file_format = self.filename.split('.')[-1] - return mmcv.dump(cfg_dict, file_format=file_format) - else: - mmcv.dump(cfg_dict, file) - - def merge_from_dict(self, options, allow_list_keys=True): - """Merge list into cfg_dict. - - Merge the dict parsed by MultipleKVAction into this cfg. - - Examples: - >>> options = {'model.backbone.depth': 50, - ... 'model.backbone.with_cp':True} - >>> cfg = Config(dict(model=dict(backbone=dict(type='ResNet')))) - >>> cfg.merge_from_dict(options) - >>> cfg_dict = super(Config, self).__getattribute__('_cfg_dict') - >>> assert cfg_dict == dict( - ... model=dict(backbone=dict(depth=50, with_cp=True))) - - # Merge list element - >>> cfg = Config(dict(pipeline=[ - ... dict(type='LoadImage'), dict(type='LoadAnnotations')])) - >>> options = dict(pipeline={'0': dict(type='SelfLoadImage')}) - >>> cfg.merge_from_dict(options, allow_list_keys=True) - >>> cfg_dict = super(Config, self).__getattribute__('_cfg_dict') - >>> assert cfg_dict == dict(pipeline=[ - ... dict(type='SelfLoadImage'), dict(type='LoadAnnotations')]) - - Args: - options (dict): dict of configs to merge from. - allow_list_keys (bool): If True, int string keys (e.g. '0', '1') - are allowed in ``options`` and will replace the element of the - corresponding index in the config if the config is a list. - Default: True. 
- """ - option_cfg_dict = {} - for full_key, v in options.items(): - d = option_cfg_dict - key_list = full_key.split('.') - for subkey in key_list[:-1]: - d.setdefault(subkey, ConfigDict()) - d = d[subkey] - subkey = key_list[-1] - d[subkey] = v - - cfg_dict = super(Config, self).__getattribute__('_cfg_dict') - super(Config, self).__setattr__( - '_cfg_dict', - Config._merge_a_into_b( - option_cfg_dict, cfg_dict, allow_list_keys=allow_list_keys)) - - -class DictAction(Action): - """ - argparse action to split an argument into KEY=VALUE form - on the first = and append to a dictionary. List options can - be passed as comma separated values, i.e 'KEY=V1,V2,V3', or with explicit - brackets, i.e. 'KEY=[V1,V2,V3]'. It also support nested brackets to build - list/tuple values. e.g. 'KEY=[(V1,V2),(V3,V4)]' - """ - - @staticmethod - def _parse_int_float_bool(val): - try: - return int(val) - except ValueError: - pass - try: - return float(val) - except ValueError: - pass - if val.lower() in ['true', 'false']: - return True if val.lower() == 'true' else False - return val - - @staticmethod - def _parse_iterable(val): - """Parse iterable values in the string. - - All elements inside '()' or '[]' are treated as iterable values. - - Args: - val (str): Value string. - - Returns: - list | tuple: The expanded list or tuple from the string. - - Examples: - >>> DictAction._parse_iterable('1,2,3') - [1, 2, 3] - >>> DictAction._parse_iterable('[a, b, c]') - ['a', 'b', 'c'] - >>> DictAction._parse_iterable('[(1, 2, 3), [a, b], c]') - [(1, 2, 3), ['a', 'b], 'c'] - """ - - def find_next_comma(string): - """Find the position of next comma in the string. - - If no ',' is found in the string, return the string length. All - chars inside '()' and '[]' are treated as one element and thus ',' - inside these brackets are ignored. - """ - assert (string.count('(') == string.count(')')) and ( - string.count('[') == string.count(']')), \ - f'Imbalanced brackets exist in {string}' - end = len(string) - for idx, char in enumerate(string): - pre = string[:idx] - # The string before this ',' is balanced - if ((char == ',') and (pre.count('(') == pre.count(')')) - and (pre.count('[') == pre.count(']'))): - end = idx - break - return end - - # Strip ' and " characters and replace whitespace. 
- val = val.strip('\'\"').replace(' ', '') - is_tuple = False - if val.startswith('(') and val.endswith(')'): - is_tuple = True - val = val[1:-1] - elif val.startswith('[') and val.endswith(']'): - val = val[1:-1] - elif ',' not in val: - # val is a single value - return DictAction._parse_int_float_bool(val) - - values = [] - while len(val) > 0: - comma_idx = find_next_comma(val) - element = DictAction._parse_iterable(val[:comma_idx]) - values.append(element) - val = val[comma_idx + 1:] - if is_tuple: - values = tuple(values) - return values - - def __call__(self, parser, namespace, values, option_string=None): - options = {} - for kv in values: - key, val = kv.split('=', maxsplit=1) - options[key] = self._parse_iterable(val) - setattr(namespace, self.dest, options) \ No newline at end of file diff --git a/AVLFormer/src/modeling/video_swin/default_runtime.py b/AVLFormer/src/modeling/video_swin/default_runtime.py deleted file mode 100644 index 5aaf9f2..0000000 --- a/AVLFormer/src/modeling/video_swin/default_runtime.py +++ /dev/null @@ -1,13 +0,0 @@ -checkpoint_config = dict(interval=1) -log_config = dict( - interval=20, - hooks=[ - dict(type='TextLoggerHook'), - # dict(type='TensorboardLoggerHook'), - ]) -# runtime settings -dist_params = dict(backend='nccl') -log_level = 'INFO' -load_from = None -resume_from = None -workflow = [('train', 1)] \ No newline at end of file diff --git a/AVLFormer/src/modeling/video_swin/swin_base.py b/AVLFormer/src/modeling/video_swin/swin_base.py deleted file mode 100644 index b0887e6..0000000 --- a/AVLFormer/src/modeling/video_swin/swin_base.py +++ /dev/null @@ -1,6 +0,0 @@ -# model settings -_base_ = "swin_tiny.py" -model = dict(backbone=dict(depths=[2, 2, 18, 2], - embed_dim=128, - num_heads=[4, 8, 16, 32]), - cls_head=dict(in_channels=1024)) \ No newline at end of file diff --git a/AVLFormer/src/modeling/video_swin/swin_base_patch244_window1677_sthv2.py b/AVLFormer/src/modeling/video_swin/swin_base_patch244_window1677_sthv2.py deleted file mode 100644 index e9c73de..0000000 --- a/AVLFormer/src/modeling/video_swin/swin_base_patch244_window1677_sthv2.py +++ /dev/null @@ -1,130 +0,0 @@ -_base_ = [ - 'swin_base.py', 'default_runtime.py' -] - -# dataset settings -dataset_type = 'VideoDataset' -data_root = 'data/sthv2/videos' -data_root_val = 'data/sthv2/videos' -ann_file_train = 'data/sthv2/sthv2_train_list_videos.txt' -ann_file_val = 'data/sthv2/sthv2_val_list_videos.txt' -ann_file_test = 'data/sthv2/sthv2_val_list_videos.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='DecordInit'), - dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1, frame_uniform=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0), - dict(type='Imgaug', transforms=[dict(type='RandAugment', n=4, m=7)]), - dict(type='Normalize', **img_norm_cfg), - dict(type='RandomErasing', probability=0.25), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=32, - frame_interval=2, - num_clips=1, - frame_uniform=True, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - 
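    # Note on _base_ inheritance (a sketch in terms of the Config class
    # earlier in this diff): swin_base.py overrides swin_tiny.py, and this
    # file merges over swin_base.py + default_runtime.py; per key, the
    # child config wins, e.g.
    #   Config._merge_a_into_b(dict(model=dict(backbone=dict(embed_dim=128))),
    #                          dict(model=dict(backbone=dict(embed_dim=96))))
    # returns {'model': {'backbone': {'embed_dim': 128}}}.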
dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=32, - frame_interval=2, - num_clips=1, - frame_uniform=True, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=1, - val_dataloader=dict( - videos_per_gpu=1, - workers_per_gpu=1 - ), - test_dataloader=dict( - videos_per_gpu=1, - workers_per_gpu=1 - ), - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - pipeline=test_pipeline)) -evaluation = dict( - interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# optimizer -optimizer = dict(type='AdamW', lr=3e-4, betas=(0.9, 0.999), weight_decay=0.05, - paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.), - 'relative_position_bias_table': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'backbone': dict(lr_mult=0.1)})) -# learning policy -lr_config = dict( - policy='CosineAnnealing', - min_lr=0, - warmup='linear', - warmup_by_epoch=True, - warmup_iters=2.5 -) -total_epochs = 60 - -# runtime settings -checkpoint_config = dict(interval=1) -work_dir = work_dir = './work_dirs/sthv2_swin_base_patch244_window1677.py' -find_unused_parameters = False - - -# do not use mmdet version fp16 -fp16 = None -optimizer_config = dict( - type="DistOptimizerHook", - update_interval=8, - grad_clip=None, - coalesce=True, - bucket_size_mb=-1, - use_fp16=True, -) - -model=dict(backbone=dict(patch_size=(2,4,4), window_size=(16,7,7), drop_path_rate=0.4), - cls_head=dict(num_classes=174), - test_cfg=dict(max_testing_views=2), - train_cfg=dict(blending=dict(type='LabelSmoothing', num_classes=174, smoothing=0.1))) diff --git a/AVLFormer/src/modeling/video_swin/swin_base_patch244_window877_kinetics400_1k.py b/AVLFormer/src/modeling/video_swin/swin_base_patch244_window877_kinetics400_1k.py deleted file mode 100644 index 52aae1c..0000000 --- a/AVLFormer/src/modeling/video_swin/swin_base_patch244_window877_kinetics400_1k.py +++ /dev/null @@ -1,122 +0,0 @@ -_base_ = [ - 'swin_base.py', 'default_runtime.py' -] -model=dict(backbone=dict(patch_size=(2,4,4), drop_path_rate=0.3), test_cfg=dict(max_testing_views=4)) - -# dataset settings -dataset_type = 'VideoDataset' -data_root = 'data/kinetics400/train' -data_root_val = 'data/kinetics400/val' -ann_file_train = 'data/kinetics400/kinetics400_train_list.txt' -ann_file_val = 'data/kinetics400/kinetics400_val_list.txt' -ann_file_test = 'data/kinetics400/kinetics400_val_list.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='DecordInit'), - dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), - 
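    # Note (a reading of this file's settings, not a measured number): with
    # videos_per_gpu=8 and the DistOptimizerHook update_interval=8 set at
    # the bottom of this file, gradients are accumulated over 8 iterations,
    # so one optimizer step sees 8 * 8 * num_gpus = 64 * num_gpus clips.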
dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=32, - frame_interval=2, - num_clips=1, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=32, - frame_interval=2, - num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=4, - val_dataloader=dict( - videos_per_gpu=1, - workers_per_gpu=1 - ), - test_dataloader=dict( - videos_per_gpu=1, - workers_per_gpu=1 - ), - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - pipeline=test_pipeline)) -evaluation = dict( - interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# optimizer -optimizer = dict(type='AdamW', lr=1e-3, betas=(0.9, 0.999), weight_decay=0.05, - paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.), - 'relative_position_bias_table': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'backbone': dict(lr_mult=0.1)})) -# learning policy -lr_config = dict( - policy='CosineAnnealing', - min_lr=0, - warmup='linear', - warmup_by_epoch=True, - warmup_iters=2.5 -) -total_epochs = 30 - -# runtime settings -checkpoint_config = dict(interval=1) -work_dir = work_dir = './work_dirs/k400_swin_base_patch244_window877.py' -find_unused_parameters = False - - -# do not use mmdet version fp16 -fp16 = None -optimizer_config = dict( - type="DistOptimizerHook", - update_interval=8, - grad_clip=None, - coalesce=True, - bucket_size_mb=-1, - use_fp16=True, -) diff --git a/AVLFormer/src/modeling/video_swin/swin_base_patch244_window877_kinetics400_22k.py b/AVLFormer/src/modeling/video_swin/swin_base_patch244_window877_kinetics400_22k.py deleted file mode 100644 index 8a82860..0000000 --- a/AVLFormer/src/modeling/video_swin/swin_base_patch244_window877_kinetics400_22k.py +++ /dev/null @@ -1,122 +0,0 @@ -_base_ = [ - 'swin_base.py', 'default_runtime.py' -] -model=dict(backbone=dict(patch_size=(2,4,4), drop_path_rate=0.2), test_cfg=dict(max_testing_views=2)) - -# dataset settings -dataset_type = 'VideoDataset' -data_root = 'data/kinetics400/train' -data_root_val = 'data/kinetics400/val' -ann_file_train = 'data/kinetics400/kinetics400_train_list.txt' -ann_file_val = 'data/kinetics400/kinetics400_val_list.txt' -ann_file_test = 
'data/kinetics400/kinetics400_val_list.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='DecordInit'), - dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=32, - frame_interval=2, - num_clips=1, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=32, - frame_interval=2, - num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=4, - val_dataloader=dict( - videos_per_gpu=1, - workers_per_gpu=1 - ), - test_dataloader=dict( - videos_per_gpu=1, - workers_per_gpu=1 - ), - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - pipeline=test_pipeline)) -evaluation = dict( - interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# optimizer -optimizer = dict(type='AdamW', lr=3e-4, betas=(0.9, 0.999), weight_decay=0.05, - paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.), - 'relative_position_bias_table': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'backbone': dict(lr_mult=0.1)})) -# learning policy -lr_config = dict( - policy='CosineAnnealing', - min_lr=0, - warmup='linear', - warmup_by_epoch=True, - warmup_iters=2.5 -) -total_epochs = 30 - -# runtime settings -checkpoint_config = dict(interval=1) -work_dir = work_dir = './work_dirs/k400_swin_base_22k_patch244_window877.py' -find_unused_parameters = False - - -# do not use mmdet version fp16 -fp16 = None -optimizer_config = dict( - type="DistOptimizerHook", - update_interval=8, - grad_clip=None, - coalesce=True, - bucket_size_mb=-1, - use_fp16=True, -) diff --git a/AVLFormer/src/modeling/video_swin/swin_base_patch244_window877_kinetics600_22k.py b/AVLFormer/src/modeling/video_swin/swin_base_patch244_window877_kinetics600_22k.py deleted file mode 100644 index a10bd1c..0000000 --- a/AVLFormer/src/modeling/video_swin/swin_base_patch244_window877_kinetics600_22k.py +++ /dev/null @@ -1,20 +0,0 @@ -_base_ = "swin_base_patch244_window877_kinetics400_22k.py" - -data_root = 'data/kinetics600/train' -data_root_val = 
'data/kinetics600/val' -ann_file_train = 'data/kinetics600/kinetics600_train_list.txt' -ann_file_val = 'data/kinetics600/kinetics600_val_list.txt' -ann_file_test = 'data/kinetics600/kinetics600_val_list.txt' - -data = dict( - train=dict( - ann_file=ann_file_train, - data_prefix=data_root), - val=dict( - ann_file=ann_file_val, - data_prefix=data_root_val), - test=dict( - ann_file=ann_file_test, - data_prefix=data_root_val)) - -model=dict(cls_head=dict(num_classes=600)) diff --git a/AVLFormer/src/modeling/video_swin/swin_large.py b/AVLFormer/src/modeling/video_swin/swin_large.py deleted file mode 100644 index 789b105..0000000 --- a/AVLFormer/src/modeling/video_swin/swin_large.py +++ /dev/null @@ -1,6 +0,0 @@ -# model settings -_base_ = "swin_tiny.py" -model = dict(backbone=dict(depths=[2, 2, 18, 2], - embed_dim=192, - num_heads=[6, 12, 24, 48]), - cls_head=dict(in_channels=1536)) \ No newline at end of file diff --git a/AVLFormer/src/modeling/video_swin/swin_large_384_patch244_window81212_kinetics400_22k.py b/AVLFormer/src/modeling/video_swin/swin_large_384_patch244_window81212_kinetics400_22k.py deleted file mode 100644 index 1c9c0b1..0000000 --- a/AVLFormer/src/modeling/video_swin/swin_large_384_patch244_window81212_kinetics400_22k.py +++ /dev/null @@ -1,124 +0,0 @@ -_base_ = [ - 'swin_large.py', 'default_runtime.py' -] -model=dict(backbone=dict(patch_size=(2,4,4), window_size=(8,12,12), drop_path_rate=0.5), test_cfg=dict(max_testing_views=1), train_cfg=dict(blending=dict(type='LabelSmoothing', num_classes=400, smoothing=0.1))) - -# dataset settings -dataset_type = 'VideoDataset' -data_root = 'data/kinetics400/train' -data_root_val = 'data/kinetics400/val' -ann_file_train = 'data/kinetics400/kinetics400_train_list.txt' -ann_file_val = 'data/kinetics400/kinetics400_val_list.txt' -ann_file_test = 'data/kinetics400/kinetics400_val_list.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='DecordInit'), - dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 416)), - dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(384, 384), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='Imgaug', transforms=[dict(type='RandAugment', n=4, m=7)]), - dict(type='Normalize', **img_norm_cfg), - dict(type='RandomErasing', probability=0.25), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=32, - frame_interval=2, - num_clips=1, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 416)), - dict(type='CenterCrop', crop_size=384), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=32, - frame_interval=2, - num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 384)), - dict(type='ThreeCrop', crop_size=384), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - 
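    # Note (sketch): num_clips=4 combined with ThreeCrop yields 4 * 3 = 12
    # views per video; test_cfg max_testing_views=1 in this file processes
    # them one at a time to bound memory, and average_clips='prob'
    # (inherited from swin_tiny.py) averages the 12 softmax outputs.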
dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=1, - val_dataloader=dict( - videos_per_gpu=1, - workers_per_gpu=1 - ), - test_dataloader=dict( - videos_per_gpu=1, - workers_per_gpu=1 - ), - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - pipeline=test_pipeline)) -evaluation = dict( - interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# optimizer -optimizer = dict(type='AdamW', lr=3e-4, betas=(0.9, 0.999), weight_decay=0.05, - paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.), - 'relative_position_bias_table': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'backbone': dict(lr_mult=0.1)})) -# learning policy -lr_config = dict( - policy='CosineAnnealing', - min_lr=0, - warmup='linear', - warmup_by_epoch=True, - warmup_iters=2.5 -) -total_epochs = 60 - -# runtime settings -checkpoint_config = dict(interval=1) -work_dir = work_dir = './work_dirs/swin_large_384_patch244_window81212_kinetics400_22k' -find_unused_parameters = False - - -# do not use mmdet version fp16 -fp16 = None -optimizer_config = dict( - type="DistOptimizerHook", - update_interval=8, - grad_clip=None, - coalesce=True, - bucket_size_mb=-1, - use_fp16=True, -) \ No newline at end of file diff --git a/AVLFormer/src/modeling/video_swin/swin_large_384_patch244_window81212_kinetics600_22k.py b/AVLFormer/src/modeling/video_swin/swin_large_384_patch244_window81212_kinetics600_22k.py deleted file mode 100644 index efece5f..0000000 --- a/AVLFormer/src/modeling/video_swin/swin_large_384_patch244_window81212_kinetics600_22k.py +++ /dev/null @@ -1,124 +0,0 @@ -_base_ = [ - 'swin_large.py', 'default_runtime.py' -] -model=dict(backbone=dict(patch_size=(2,4,4), window_size=(8,12,12), drop_path_rate=0.4), test_cfg=dict(max_testing_views=1), cls_head=dict(num_classes=600), train_cfg=dict(blending=dict(type='LabelSmoothing', num_classes=600, smoothing=0.1))) - -# dataset settings -dataset_type = 'VideoDataset' -data_root = 'data/kinetics600/train' -data_root_val = 'data/kinetics600/val' -ann_file_train = 'data/kinetics600/kinetics600_train_list.txt' -ann_file_val = 'data/kinetics600/kinetics600_val_list.txt' -ann_file_test = 'data/kinetics600/kinetics600_val_list.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='DecordInit'), - dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 416)), - dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(384, 384), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='Imgaug', transforms=[dict(type='RandAugment', n=4, m=7)]), - dict(type='Normalize', **img_norm_cfg), - dict(type='RandomErasing', probability=0.25), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=32, - frame_interval=2, - num_clips=1, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 416)), - dict(type='CenterCrop', crop_size=384), - dict(type='Flip', flip_ratio=0), - 
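    # Note (sketch): window_size=(8,12,12) is matched to 384x384 inputs;
    # with patch_size=(2,4,4) a 32x384x384 clip gives a 16x96x96 token
    # grid, and after three 2x2 spatial PatchMerging steps the last stage
    # is 16x12x12, so the 12x12 spatial window spans a full stage-4 map.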
dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=32, - frame_interval=2, - num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 384)), - dict(type='ThreeCrop', crop_size=384), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=1, - val_dataloader=dict( - videos_per_gpu=1, - workers_per_gpu=1 - ), - test_dataloader=dict( - videos_per_gpu=1, - workers_per_gpu=1 - ), - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - pipeline=test_pipeline)) -evaluation = dict( - interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# optimizer -optimizer = dict(type='AdamW', lr=3e-4, betas=(0.9, 0.999), weight_decay=0.05, - paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.), - 'relative_position_bias_table': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'backbone': dict(lr_mult=0.1)})) -# learning policy -lr_config = dict( - policy='CosineAnnealing', - min_lr=0, - warmup='linear', - warmup_by_epoch=True, - warmup_iters=2.5 -) -total_epochs = 60 - -# runtime settings -checkpoint_config = dict(interval=1) -work_dir = work_dir = './work_dirs/swin_large_384_patch244_window81212_kinetics600_22k' -find_unused_parameters = False - - -# do not use mmdet version fp16 -fp16 = None -optimizer_config = dict( - type="DistOptimizerHook", - update_interval=8, - grad_clip=None, - coalesce=True, - bucket_size_mb=-1, - use_fp16=True, -) \ No newline at end of file diff --git a/AVLFormer/src/modeling/video_swin/swin_large_patch244_window877_kinetics400_22k.py b/AVLFormer/src/modeling/video_swin/swin_large_patch244_window877_kinetics400_22k.py deleted file mode 100644 index 429ed81..0000000 --- a/AVLFormer/src/modeling/video_swin/swin_large_patch244_window877_kinetics400_22k.py +++ /dev/null @@ -1,122 +0,0 @@ -_base_ = [ - 'swin_large.py', 'default_runtime.py' -] -model=dict(backbone=dict(patch_size=(2,4,4), drop_path_rate=0.2), test_cfg=dict(max_testing_views=1)) - -# dataset settings -dataset_type = 'VideoDataset' -data_root = 'data/kinetics400/train' -data_root_val = 'data/kinetics400/val' -ann_file_train = 'data/kinetics400/kinetics400_train_list.txt' -ann_file_val = 'data/kinetics400/kinetics400_val_list.txt' -ann_file_test = 'data/kinetics400/kinetics400_val_list.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='DecordInit'), - dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], 
meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=32, - frame_interval=2, - num_clips=1, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=32, - frame_interval=2, - num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=1, - val_dataloader=dict( - videos_per_gpu=1, - workers_per_gpu=1 - ), - test_dataloader=dict( - videos_per_gpu=1, - workers_per_gpu=1 - ), - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - pipeline=test_pipeline)) -evaluation = dict( - interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# optimizer -optimizer = dict(type='AdamW', lr=3e-4, betas=(0.9, 0.999), weight_decay=0.05, - paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.), - 'relative_position_bias_table': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'backbone': dict(lr_mult=0.1)})) -# learning policy -lr_config = dict( - policy='CosineAnnealing', - min_lr=0, - warmup='linear', - warmup_by_epoch=True, - warmup_iters=2.5 -) -total_epochs = 30 - -# runtime settings -checkpoint_config = dict(interval=1) -work_dir = work_dir = './work_dirs/swin_large_patch244_window877_kinetics400_22k' -find_unused_parameters = False - - -# do not use mmdet version fp16 -fp16 = None -optimizer_config = dict( - type="DistOptimizerHook", - update_interval=8, - grad_clip=None, - coalesce=True, - bucket_size_mb=-1, - use_fp16=True, -) \ No newline at end of file diff --git a/AVLFormer/src/modeling/video_swin/swin_small_patch244_window877_kinetics400_1k.py b/AVLFormer/src/modeling/video_swin/swin_small_patch244_window877_kinetics400_1k.py deleted file mode 100644 index a3bd913..0000000 --- a/AVLFormer/src/modeling/video_swin/swin_small_patch244_window877_kinetics400_1k.py +++ /dev/null @@ -1,122 +0,0 @@ -_base_ = [ - '../../_base_/models/swin/swin_small.py', '../../_base_/default_runtime.py' -] -model=dict(backbone=dict(patch_size=(2,4,4), drop_path_rate=0.1), test_cfg=dict(max_testing_views=4)) - -# dataset settings -dataset_type = 'VideoDataset' -data_root = 'data/kinetics400/train' -data_root_val = 'data/kinetics400/val' -ann_file_train = 'data/kinetics400/kinetics400_train_list.txt' -ann_file_val = 'data/kinetics400/kinetics400_val_list.txt' -ann_file_test = 'data/kinetics400/kinetics400_val_list.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='DecordInit'), - dict(type='SampleFrames', 
clip_len=32, frame_interval=2, num_clips=1), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=32, - frame_interval=2, - num_clips=1, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=32, - frame_interval=2, - num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=4, - val_dataloader=dict( - videos_per_gpu=1, - workers_per_gpu=1 - ), - test_dataloader=dict( - videos_per_gpu=1, - workers_per_gpu=1 - ), - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - pipeline=test_pipeline)) -evaluation = dict( - interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# optimizer -optimizer = dict(type='AdamW', lr=1e-3, betas=(0.9, 0.999), weight_decay=0.02, - paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.), - 'relative_position_bias_table': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'backbone': dict(lr_mult=0.1)})) -# learning policy -lr_config = dict( - policy='CosineAnnealing', - min_lr=0, - warmup='linear', - warmup_by_epoch=True, - warmup_iters=2.5 -) -total_epochs = 30 - -# runtime settings -checkpoint_config = dict(interval=1) -work_dir = work_dir = './work_dirs/k400_swin_small_patch244_window877.py' -find_unused_parameters = False - - -# do not use mmdet version fp16 -fp16 = None -optimizer_config = dict( - type="DistOptimizerHook", - update_interval=8, - grad_clip=None, - coalesce=True, - bucket_size_mb=-1, - use_fp16=True, -) diff --git a/AVLFormer/src/modeling/video_swin/swin_tiny.py b/AVLFormer/src/modeling/video_swin/swin_tiny.py deleted file mode 100644 index 69731a3..0000000 --- a/AVLFormer/src/modeling/video_swin/swin_tiny.py +++ /dev/null @@ -1,24 +0,0 @@ -# model settings -model = dict( - type='Recognizer3D', - backbone=dict( - type='SwinTransformer3D', - patch_size=(4,4,4), - embed_dim=96, - depths=[2, 2, 6, 2], - num_heads=[3, 6, 12, 24], - window_size=(8,7,7), - mlp_ratio=4., - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.2, - patch_norm=True), - cls_head=dict( - type='I3DHead', - in_channels=768, - num_classes=400, - spatial_type='avg', - 
dropout_ratio=0.5), - test_cfg = dict(average_clips='prob')) \ No newline at end of file diff --git a/AVLFormer/src/modeling/video_swin/swin_tiny_patch244_window877_kinetics400_1k.py b/AVLFormer/src/modeling/video_swin/swin_tiny_patch244_window877_kinetics400_1k.py deleted file mode 100644 index 433d06e..0000000 --- a/AVLFormer/src/modeling/video_swin/swin_tiny_patch244_window877_kinetics400_1k.py +++ /dev/null @@ -1,122 +0,0 @@ -_base_ = [ - '../../_base_/models/swin/swin_tiny.py', '../../_base_/default_runtime.py' -] -model=dict(backbone=dict(patch_size=(2,4,4), drop_path_rate=0.1), test_cfg=dict(max_testing_views=4)) - -# dataset settings -dataset_type = 'VideoDataset' -data_root = 'data/kinetics400/train' -data_root_val = 'data/kinetics400/val' -ann_file_train = 'data/kinetics400/kinetics400_train_list.txt' -ann_file_val = 'data/kinetics400/kinetics400_val_list.txt' -ann_file_test = 'data/kinetics400/kinetics400_val_list.txt' -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='DecordInit'), - dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=32, - frame_interval=2, - num_clips=1, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=32, - frame_interval=2, - num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=8, - workers_per_gpu=4, - val_dataloader=dict( - videos_per_gpu=1, - workers_per_gpu=1 - ), - test_dataloader=dict( - videos_per_gpu=1, - workers_per_gpu=1 - ), - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - pipeline=test_pipeline)) -evaluation = dict( - interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) - -# optimizer -optimizer = dict(type='AdamW', lr=1e-3, betas=(0.9, 0.999), weight_decay=0.02, - paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.), - 'relative_position_bias_table': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'backbone': dict(lr_mult=0.1)})) -# learning policy -lr_config = dict( - policy='CosineAnnealing', - min_lr=0, - warmup='linear', - 
warmup_by_epoch=True, - warmup_iters=2.5 -) -total_epochs = 30 - -# runtime settings -checkpoint_config = dict(interval=1) -work_dir = work_dir = './work_dirs/k400_swin_tiny_patch244_window877.py' -find_unused_parameters = False - - -# do not use mmdet version fp16 -fp16 = None -optimizer_config = dict( - type="DistOptimizerHook", - update_interval=4, - grad_clip=None, - coalesce=True, - bucket_size_mb=-1, - use_fp16=True, -) diff --git a/AVLFormer/src/modeling/video_swin/swin_transformer.py b/AVLFormer/src/modeling/video_swin/swin_transformer.py deleted file mode 100644 index 8e9ff16..0000000 --- a/AVLFormer/src/modeling/video_swin/swin_transformer.py +++ /dev/null @@ -1,700 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.utils.checkpoint as checkpoint -import numpy as np -from src.timm.models.layers import DropPath, trunc_normal_ - -# from mmcv.runner import load_checkpoint - -from functools import reduce, lru_cache -from operator import mul -from einops import rearrange - - -def _get_checkpoint_loader(cls, path): - """Finds a loader that supports the given path. Falls back to the local - loader if no other loader is found. - Args: - path (str): checkpoint path - Returns: - loader (function): checkpoint loader - """ - - for p in cls._schemes: - if path.startswith(p): - return cls._schemes[p] - -def load_checkpoint(cls, filename, map_location=None): - """load checkpoint through URL scheme path. - Args: - filename (str): checkpoint file name with given prefix - map_location (str, optional): Same as :func:`torch.load`. - Default: None - logger (:mod:`logging.Logger`, optional): The logger for message. - Default: None - Returns: - dict or OrderedDict: The loaded checkpoint. - """ - - checkpoint_loader = cls._get_checkpoint_loader(filename) - class_name = checkpoint_loader.__name__ - return checkpoint_loader(filename, map_location) - - - -class Mlp(nn.Module): - """ Multilayer perceptron.""" - - def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -def window_partition(x, window_size): - """ - Args: - x: (B, D, H, W, C) - window_size (tuple[int]): window size - - Returns: - windows: (B*num_windows, window_size*window_size, C) - """ - B, D, H, W, C = x.shape - x = x.view(B, D // window_size[0], window_size[0], H // window_size[1], window_size[1], W // window_size[2], window_size[2], C) - windows = x.permute(0, 1, 3, 5, 2, 4, 6, 7).contiguous().view(-1, reduce(mul, window_size), C) - return windows - - -def window_reverse(windows, window_size, B, D, H, W): - """ - Args: - windows: (B*num_windows, window_size, window_size, C) - window_size (tuple[int]): Window size - H (int): Height of image - W (int): Width of image - - Returns: - x: (B, D, H, W, C) - """ - x = windows.view(B, D // window_size[0], H // window_size[1], W // window_size[2], window_size[0], window_size[1], window_size[2], -1) - x = x.permute(0, 1, 4, 2, 5, 3, 6, 7).contiguous().view(B, D, H, W, -1) - return x - - - - -def get_window_size(x_size, window_size, shift_size=None): - use_window_size = list(window_size) - if shift_size is 
not None: - use_shift_size = list(shift_size) - for i in range(len(x_size)): - if x_size[i] <= window_size[i]: - use_window_size[i] = x_size[i] - if shift_size is not None: - use_shift_size[i] = 0 - - if shift_size is None: - return tuple(use_window_size) - else: - return tuple(use_window_size), tuple(use_shift_size) - - -class WindowAttention3D(nn.Module): - """ Window based multi-head self attention (W-MSA) module with relative position bias. - It supports both of shifted and non-shifted window. - Args: - dim (int): Number of input channels. - window_size (tuple[int]): The temporal length, height and width of the window. - num_heads (int): Number of attention heads. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set - attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 - proj_drop (float, optional): Dropout ratio of output. Default: 0.0 - """ - - def __init__(self, dim, window_size, num_heads, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): - - super().__init__() - self.dim = dim - self.window_size = window_size # Wd, Wh, Ww - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim ** -0.5 - - # define a parameter table of relative position bias - self.relative_position_bias_table = nn.Parameter( - torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1) * (2 * window_size[2] - 1), num_heads)) # 2*Wd-1 * 2*Wh-1 * 2*Ww-1, nH - - # get pair-wise relative position index for each token inside the window - coords_d = torch.arange(self.window_size[0]) - coords_h = torch.arange(self.window_size[1]) - coords_w = torch.arange(self.window_size[2]) - coords = torch.stack(torch.meshgrid(coords_d, coords_h, coords_w)) # 3, Wd, Wh, Ww - coords_flatten = torch.flatten(coords, 1) # 3, Wd*Wh*Ww - relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 3, Wd*Wh*Ww, Wd*Wh*Ww - relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wd*Wh*Ww, Wd*Wh*Ww, 3 - relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += self.window_size[1] - 1 - relative_coords[:, :, 2] += self.window_size[2] - 1 - - relative_coords[:, :, 0] *= (2 * self.window_size[1] - 1) * (2 * self.window_size[2] - 1) - relative_coords[:, :, 1] *= (2 * self.window_size[2] - 1) - relative_position_index = relative_coords.sum(-1) # Wd*Wh*Ww, Wd*Wh*Ww - self.register_buffer("relative_position_index", relative_position_index) - - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - trunc_normal_(self.relative_position_bias_table, std=.02) - self.softmax = nn.Softmax(dim=-1) - - def forward(self, x, mask=None): - """ Forward function. 
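        The relative-position lookup built in __init__ flattens a
        (2*Wd-1) x (2*Wh-1) x (2*Ww-1) offset grid into one index per
        token pair; e.g. for window_size=(8,7,7) the bias table holds
        15 * 13 * 13 = 2535 entries per head.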
- Args: - x: input features with shape of (num_windows*B, N, C) - mask: (0/-inf) mask with shape of (num_windows, N, N) or None - """ - B_, N, C = x.shape - qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) - q, k, v = qkv[0], qkv[1], qkv[2] # B_, nH, N, C - - q = q * self.scale - attn = q @ k.transpose(-2, -1) - - relative_position_bias = self.relative_position_bias_table[self.relative_position_index[:N, :N].reshape(-1)].reshape( - N, N, -1) # Wd*Wh*Ww,Wd*Wh*Ww,nH - relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wd*Wh*Ww, Wd*Wh*Ww - attn = attn + relative_position_bias.unsqueeze(0) # B_, nH, N, N - - if mask is not None: - nW = mask.shape[0] - attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) - attn = attn.view(-1, self.num_heads, N, N) - attn = self.softmax(attn) - else: - attn = self.softmax(attn) - - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B_, N, C) - x = self.proj(x) - x = self.proj_drop(x) - return x - - -class SwinTransformerBlock3D(nn.Module): - """ Swin Transformer Block. - - Args: - dim (int): Number of input channels. - num_heads (int): Number of attention heads. - window_size (tuple[int]): Window size. - shift_size (tuple[int]): Shift size for SW-MSA. - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. - drop (float, optional): Dropout rate. Default: 0.0 - attn_drop (float, optional): Attention dropout rate. Default: 0.0 - drop_path (float, optional): Stochastic depth rate. Default: 0.0 - act_layer (nn.Module, optional): Activation layer. Default: nn.GELU - norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm - """ - - def __init__(self, dim, num_heads, window_size=(2,7,7), shift_size=(0,0,0), - mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., - act_layer=nn.GELU, norm_layer=nn.LayerNorm, use_checkpoint=False): - super().__init__() - self.dim = dim - self.num_heads = num_heads - self.window_size = window_size - self.shift_size = shift_size - self.mlp_ratio = mlp_ratio - self.use_checkpoint=use_checkpoint - - assert 0 <= self.shift_size[0] < self.window_size[0], "shift_size must in 0-window_size" - assert 0 <= self.shift_size[1] < self.window_size[1], "shift_size must in 0-window_size" - assert 0 <= self.shift_size[2] < self.window_size[2], "shift_size must in 0-window_size" - - self.norm1 = norm_layer(dim) - self.attn = WindowAttention3D( - dim, window_size=self.window_size, num_heads=num_heads, - qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) - - self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) - - def forward_part1(self, x, mask_matrix): - B, D, H, W, C = x.shape - window_size, shift_size = get_window_size((D, H, W), self.window_size, self.shift_size) - - x = self.norm1(x) - # pad feature maps to multiples of window size - pad_l = pad_t = pad_d0 = 0 - pad_d1 = (window_size[0] - D % window_size[0]) % window_size[0] - pad_b = (window_size[1] - H % window_size[1]) % window_size[1] - pad_r = (window_size[2] - W % window_size[2]) % window_size[2] - x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b, pad_d0, pad_d1)) - _, Dp, Hp, Wp, _ = x.shape - # cyclic shift - if any(i > 0 for i in shift_size): - shifted_x = torch.roll(x, shifts=(-shift_size[0], -shift_size[1], -shift_size[2]), dims=(1, 2, 3)) - attn_mask = mask_matrix - else: - shifted_x = x - attn_mask = None - # partition windows - x_windows = window_partition(shifted_x, window_size) # B*nW, Wd*Wh*Ww, C - # W-MSA/SW-MSA - attn_windows = self.attn(x_windows, mask=attn_mask) # B*nW, Wd*Wh*Ww, C - # merge windows - attn_windows = attn_windows.view(-1, *(window_size+(C,))) - shifted_x = window_reverse(attn_windows, window_size, B, Dp, Hp, Wp) # B D' H' W' C - # reverse cyclic shift - if any(i > 0 for i in shift_size): - x = torch.roll(shifted_x, shifts=(shift_size[0], shift_size[1], shift_size[2]), dims=(1, 2, 3)) - else: - x = shifted_x - - if pad_d1 >0 or pad_r > 0 or pad_b > 0: - x = x[:, :D, :H, :W, :].contiguous() - return x - - def forward_part2(self, x): - return self.drop_path(self.mlp(self.norm2(x))) - - def forward(self, x, mask_matrix): - """ Forward function. - - Args: - x: Input feature, tensor size (B, D, H, W, C). - mask_matrix: Attention mask for cyclic shift. - """ - - shortcut = x - if self.use_checkpoint: - x = checkpoint.checkpoint(self.forward_part1, x, mask_matrix) - else: - x = self.forward_part1(x, mask_matrix) - x = shortcut + self.drop_path(x) - - if self.use_checkpoint: - x = x + checkpoint.checkpoint(self.forward_part2, x) - else: - x = x + self.forward_part2(x) - - return x - - -class PatchMerging(nn.Module): - """ Patch Merging Layer - - Args: - dim (int): Number of input channels. - norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm - """ - def __init__(self, dim, norm_layer=nn.LayerNorm): - super().__init__() - self.dim = dim - self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) - self.norm = norm_layer(4 * dim) - - def forward(self, x): - """ Forward function. - - Args: - x: Input feature, tensor size (B, D, H, W, C). 
- """ - B, D, H, W, C = x.shape - - # padding - pad_input = (H % 2 == 1) or (W % 2 == 1) - if pad_input: - x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) - - x0 = x[:, :, 0::2, 0::2, :] # B D H/2 W/2 C - x1 = x[:, :, 1::2, 0::2, :] # B D H/2 W/2 C - x2 = x[:, :, 0::2, 1::2, :] # B D H/2 W/2 C - x3 = x[:, :, 1::2, 1::2, :] # B D H/2 W/2 C - x = torch.cat([x0, x1, x2, x3], -1) # B D H/2 W/2 4*C - - x = self.norm(x) - x = self.reduction(x) - - return x - - -# cache each stage results -@lru_cache() -def compute_mask(D, H, W, window_size, shift_size, device): - img_mask = torch.zeros((1, D, H, W, 1), device=device) # 1 Dp Hp Wp 1 - cnt = 0 - for d in slice(-window_size[0]), slice(-window_size[0], -shift_size[0]), slice(-shift_size[0],None): - for h in slice(-window_size[1]), slice(-window_size[1], -shift_size[1]), slice(-shift_size[1],None): - for w in slice(-window_size[2]), slice(-window_size[2], -shift_size[2]), slice(-shift_size[2],None): - img_mask[:, d, h, w, :] = cnt - cnt += 1 - mask_windows = window_partition(img_mask, window_size) # nW, ws[0]*ws[1]*ws[2], 1 - mask_windows = mask_windows.squeeze(-1) # nW, ws[0]*ws[1]*ws[2] - attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) - attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) - return attn_mask - - -class BasicLayer(nn.Module): - """ A basic Swin Transformer layer for one stage. - - Args: - dim (int): Number of feature channels - depth (int): Depths of this stage. - num_heads (int): Number of attention head. - window_size (tuple[int]): Local window size. Default: (1,7,7). - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. - drop (float, optional): Dropout rate. Default: 0.0 - attn_drop (float, optional): Attention dropout rate. Default: 0.0 - drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 - norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm - downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None - """ - - def __init__(self, - dim, - depth, - num_heads, - window_size=(1,7,7), - mlp_ratio=4., - qkv_bias=False, - qk_scale=None, - drop=0., - attn_drop=0., - drop_path=0., - norm_layer=nn.LayerNorm, - downsample=None, - use_checkpoint=False): - super().__init__() - self.window_size = window_size - self.shift_size = tuple(i // 2 for i in window_size) - self.depth = depth - self.use_checkpoint = use_checkpoint - - # build blocks - self.blocks = nn.ModuleList([ - SwinTransformerBlock3D( - dim=dim, - num_heads=num_heads, - window_size=window_size, - shift_size=(0,0,0) if (i % 2 == 0) else self.shift_size, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop, - attn_drop=attn_drop, - drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, - norm_layer=norm_layer, - use_checkpoint=use_checkpoint, - ) - for i in range(depth)]) - - self.downsample = downsample - if self.downsample is not None: - self.downsample = downsample(dim=dim, norm_layer=norm_layer) - - def forward(self, x): - """ Forward function. - - Args: - x: Input feature, tensor size (B, C, D, H, W). 
- """ - # calculate attention mask for SW-MSA - B, C, D, H, W = x.shape - window_size, shift_size = get_window_size((D,H,W), self.window_size, self.shift_size) - x = rearrange(x, 'b c d h w -> b d h w c') - Dp = int(np.ceil(D / window_size[0])) * window_size[0] - Hp = int(np.ceil(H / window_size[1])) * window_size[1] - Wp = int(np.ceil(W / window_size[2])) * window_size[2] - attn_mask = compute_mask(Dp, Hp, Wp, window_size, shift_size, x.device) - for blk in self.blocks: - # safeguard fp16 - attn_mask = attn_mask.to(dtype=x.dtype) - x = blk(x, attn_mask) - x = x.view(B, D, H, W, -1) - - if self.downsample is not None: - x = self.downsample(x) - x = rearrange(x, 'b d h w c -> b c d h w') - return x - - -class PatchEmbed3D(nn.Module): - """ Video to Patch Embedding. - - Args: - patch_size (int): Patch token size. Default: (2,4,4). - in_chans (int): Number of input video channels. Default: 3. - embed_dim (int): Number of linear projection output channels. Default: 96. - norm_layer (nn.Module, optional): Normalization layer. Default: None - """ - def __init__(self, patch_size=(2,4,4), in_chans=3, embed_dim=96, norm_layer=None): - super().__init__() - self.patch_size = patch_size - - self.in_chans = in_chans - self.embed_dim = embed_dim - - self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) - if norm_layer is not None: - self.norm = norm_layer(embed_dim) - else: - self.norm = None - - def forward(self, x): - """Forward function.""" - # padding - _, _, D, H, W = x.size() - if W % self.patch_size[2] != 0: - x = F.pad(x, (0, self.patch_size[2] - W % self.patch_size[2])) - if H % self.patch_size[1] != 0: - x = F.pad(x, (0, 0, 0, self.patch_size[1] - H % self.patch_size[1])) - if D % self.patch_size[0] != 0: - x = F.pad(x, (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0])) - - x = self.proj(x) # B C D Wh Ww - if self.norm is not None: - D, Wh, Ww = x.size(2), x.size(3), x.size(4) - x = x.flatten(2).transpose(1, 2) - x = self.norm(x) - x = x.transpose(1, 2).view(-1, self.embed_dim, D, Wh, Ww) - - return x - -class SwinTransformer3D(nn.Module): - """ Swin Transformer backbone. - A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - - https://arxiv.org/pdf/2103.14030 - - Args: - patch_size (int | tuple(int)): Patch size. Default: (4,4,4). - in_chans (int): Number of input image channels. Default: 3. - embed_dim (int): Number of linear projection output channels. Default: 96. - depths (tuple[int]): Depths of each Swin Transformer stage. - num_heads (tuple[int]): Number of attention head of each stage. - window_size (int): Window size. Default: 7. - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. - qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: Truee - qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. - drop_rate (float): Dropout rate. - attn_drop_rate (float): Attention dropout rate. Default: 0. - drop_path_rate (float): Stochastic depth rate. Default: 0.2. - norm_layer: Normalization layer. Default: nn.LayerNorm. - patch_norm (bool): If True, add normalization after patch embedding. Default: False. - frozen_stages (int): Stages to be frozen (stop grad and set eval mode). - -1 means not freezing any parameters. 
- """ - - def __init__(self, - pretrained=None, - pretrained2d=False, - patch_size=(4,4,4), - in_chans=3, - embed_dim=96, - depths=[2, 2, 6, 2], - num_heads=[3, 6, 12, 24], - window_size=(2,7,7), - mlp_ratio=4., - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.2, - norm_layer=nn.LayerNorm, - patch_norm=False, - frozen_stages=-1, - use_checkpoint=False): - super().__init__() - - self.pretrained = pretrained - self.pretrained2d = pretrained2d - self.num_layers = len(depths) - self.embed_dim = embed_dim - self.patch_norm = patch_norm - self.frozen_stages = frozen_stages - self.window_size = window_size - self.patch_size = patch_size - - # split image into non-overlapping patches - self.patch_embed = PatchEmbed3D( - patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, - norm_layer=norm_layer if self.patch_norm else None) - - self.pos_drop = nn.Dropout(p=drop_rate) - - # stochastic depth - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule - - # build layers - self.layers = nn.ModuleList() - for i_layer in range(self.num_layers): - layer = BasicLayer( - dim=int(embed_dim * 2**i_layer), - depth=depths[i_layer], - num_heads=num_heads[i_layer], - window_size=window_size, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], - norm_layer=norm_layer, - downsample=PatchMerging if i_layer= 0: - self.patch_embed.eval() - for param in self.patch_embed.parameters(): - param.requires_grad = False - - if self.frozen_stages >= 1: - self.pos_drop.eval() - for i in range(0, self.frozen_stages): - m = self.layers[i] - m.eval() - for param in m.parameters(): - param.requires_grad = False - - def inflate_weights(self): - """Inflate the swin2d parameters to swin3d. - - The differences between swin3d and swin2d mainly lie in an extra - axis. To utilize the pretrained parameters in 2d model, - the weight of swin2d models should be inflated to fit in the shapes of - the 3d counterpart. - - Args: - logger (logging.Logger): The logger used to print - debugging infomation. 
- """ - checkpoint = torch.load(self.pretrained, map_location='cpu') - state_dict = checkpoint['model'] - - # delete relative_position_index since we always re-init it - relative_position_index_keys = [k for k in state_dict.keys() if "relative_position_index" in k] - for k in relative_position_index_keys: - del state_dict[k] - - # delete attn_mask since we always re-init it - attn_mask_keys = [k for k in state_dict.keys() if "attn_mask" in k] - for k in attn_mask_keys: - del state_dict[k] - - state_dict['patch_embed.proj.weight'] = state_dict['patch_embed.proj.weight'].unsqueeze(2).repeat(1,1,self.patch_size[0],1,1) / self.patch_size[0] - - # bicubic interpolate relative_position_bias_table if not match - relative_position_bias_table_keys = [k for k in state_dict.keys() if "relative_position_bias_table" in k] - for k in relative_position_bias_table_keys: - relative_position_bias_table_pretrained = state_dict[k] - relative_position_bias_table_current = self.state_dict()[k] - L1, nH1 = relative_position_bias_table_pretrained.size() - L2, nH2 = relative_position_bias_table_current.size() - L2 = (2*self.window_size[1]-1) * (2*self.window_size[2]-1) - wd = self.window_size[0] - if nH1 != nH2: - print(f"Error in loading {k}, passing") - else: - if L1 != L2: - S1 = int(L1 ** 0.5) - relative_position_bias_table_pretrained_resized = torch.nn.functional.interpolate( - relative_position_bias_table_pretrained.permute(1, 0).view(1, nH1, S1, S1), size=(2*self.window_size[1]-1, 2*self.window_size[2]-1), - mode='bicubic') - relative_position_bias_table_pretrained = relative_position_bias_table_pretrained_resized.view(nH2, L2).permute(1, 0) - state_dict[k] = relative_position_bias_table_pretrained.repeat(2*wd-1,1) - - msg = self.load_state_dict(state_dict, strict=False) - print(msg) - print(f"=> loaded successfully '{self.pretrained}'") - del checkpoint - torch.cuda.empty_cache() - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. - """ - def _init_weights(m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - if pretrained: - self.pretrained = pretrained - if isinstance(self.pretrained, str): - self.apply(_init_weights) - print(f'load model from: {self.pretrained}') - - if self.pretrained2d: - print('Inflate 2D model into 3D model.') - # Inflate 2D model into 3D model. - self.inflate_weights() - else: - print('Directly load 3D model') - # Directly load 3D model. 
- load_checkpoint(self, self.pretrained, map_location=False) - elif self.pretrained is None: - self.apply(_init_weights) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - """Forward function.""" - x = self.patch_embed(x) - - x = self.pos_drop(x) - - for layer in self.layers: - x = layer(x.contiguous()) - - x = rearrange(x, 'n c d h w -> n d h w c') - x = self.norm(x) - x = rearrange(x, 'n d h w c -> n c d h w') - - return x - - def train(self, mode=True): - """Convert the model into training mode while keep layers freezed.""" - super(SwinTransformer3D, self).train(mode) - self._freeze_stages() - diff --git a/AVLFormer/src/solver/LARC.py b/AVLFormer/src/solver/LARC.py deleted file mode 100644 index 58cbb8c..0000000 --- a/AVLFormer/src/solver/LARC.py +++ /dev/null @@ -1,102 +0,0 @@ -import torch -from torch import nn -from torch.nn.parameter import Parameter - - -class LARC(object): - """ - :class:`LARC` is a pytorch implementation of both the scaling and clipping variants of LARC, - in which the ratio between gradient and parameter magnitudes is used to calculate an adaptive - local learning rate for each individual parameter. The algorithm is designed to improve - convergence of large batch training. - - See https://arxiv.org/abs/1708.03888 for calculation of the local learning rate. - - In practice it modifies the gradients of parameters as a proxy for modifying the learning rate - of the parameters. This design allows it to be used as a wrapper around any torch.optim Optimizer. - - ``` - model = ... - optim = torch.optim.Adam(model.parameters(), lr=...) - optim = LARC(optim) - ``` - - It can even be used in conjunction with apex.fp16_utils.FP16_optimizer. - - ``` - model = ... - optim = torch.optim.Adam(model.parameters(), lr=...) - optim = LARC(optim) - optim = apex.fp16_utils.FP16_Optimizer(optim) - ``` - - Args: - optimizer: Pytorch optimizer to wrap and modify learning rate for. - trust_coefficient: Trust coefficient for calculating the lr. See https://arxiv.org/abs/1708.03888 - clip: Decides between clipping or scaling mode of LARC. If `clip=True` the learning rate is set to `min(optimizer_lr, local_lr)` for each parameter. If `clip=False` the learning rate is set to `local_lr*optimizer_lr`. 
- eps: epsilon kludge to help with numerical stability while calculating adaptive_lr - """ - - def __init__(self, optimizer, trust_coefficient=0.02, clip=True, eps=1e-8): - self.param_groups = optimizer.param_groups - self.optim = optimizer - self.trust_coefficient = trust_coefficient - self.eps = eps - self.clip = clip - - def __getstate__(self): - return self.optim.__getstate__() - - def __setstate__(self, state): - self.optim.__setstate__(state) - - def __repr__(self): - return self.optim.__repr__() - - def state_dict(self): - return self.optim.state_dict() - - def load_state_dict(self, state_dict): - self.optim.load_state_dict(state_dict) - - def zero_grad(self): - self.optim.zero_grad() - - def add_param_group(self, param_group): - self.optim.add_param_group(param_group) - - def step(self): - with torch.no_grad(): - weight_decays = [] - for group in self.optim.param_groups: - # absorb weight decay control from optimizer - weight_decay = group[ - 'weight_decay'] if 'weight_decay' in group else 0 - weight_decays.append(weight_decay) - group['weight_decay'] = 0 - adlrs = [] - for p in group['params']: - if p.grad is None: - continue - param_norm = torch.norm(p.data) - grad_norm = torch.norm(p.grad.data) - - if param_norm != 0 and grad_norm != 0: - # calculate adaptive lr + weight decay - adaptive_lr = self.trust_coefficient * (param_norm) / ( - grad_norm + param_norm * weight_decay + self.eps) - - # clip learning rate for LARC - if self.clip: - # calculation of adaptive_lr so that when multiplied by lr it equals `min(adaptive_lr, lr)` - adaptive_lr = min(adaptive_lr / group['lr'], 1) - adlrs.append(adaptive_lr) - p.grad.data += weight_decay * p.data - p.grad.data *= adaptive_lr - group['adaptive_lr'] = sum(adlrs) / len(adlrs) if len( - adlrs) != 0 else 1 - - self.optim.step() - # return weight decay control to optimizer - for i, group in enumerate(self.optim.param_groups): - group['weight_decay'] = weight_decays[i] diff --git a/AVLFormer/src/solver/__init__.py b/AVLFormer/src/solver/__init__.py deleted file mode 100755 index 27771a8..0000000 --- a/AVLFormer/src/solver/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -from .bertadam import BertAdam -from .build import make_lr_scheduler, make_optimizer -from .get_solver import get_optimizer, get_scheduler -from .lr_scheduler import WarmupCosineAnnealingLR, WarmupLinearLR, WarmupMultiStepLR -from .optimization import ( - AdamW, - ConstantLRSchedule, - WarmupConstantSchedule, - WarmupCosineSchedule, - WarmupCosineWithHardRestartsSchedule, - WarmupLinearSchedule, - WarmupMultiStepSchedule, -) diff --git a/AVLFormer/src/solver/bertadam.py b/AVLFormer/src/solver/bertadam.py deleted file mode 100644 index 3269d0a..0000000 --- a/AVLFormer/src/solver/bertadam.py +++ /dev/null @@ -1,197 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
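The gradient rescaling inside `LARC.step()` above is the heart of the algorithm: each parameter gets a trust ratio built from its own norm and its gradient's norm, optionally clipped against the group's learning rate. A standalone sketch of that rule for a single parameter tensor (`larc_trust_ratio` and the toy tensors are illustrative names, not part of the repository):

```python
import torch

def larc_trust_ratio(param, grad, lr, weight_decay=0.0,
                     trust_coefficient=0.02, clip=True, eps=1e-8):
    """Gradient multiplier that LARC.step() applies to one parameter."""
    param_norm = torch.norm(param).item()
    grad_norm = torch.norm(grad).item()
    if param_norm == 0.0 or grad_norm == 0.0:
        return 1.0  # degenerate case: leave the gradient untouched
    adaptive_lr = trust_coefficient * param_norm / (
        grad_norm + weight_decay * param_norm + eps)
    if clip:
        # dividing by lr and capping at 1 means the effective step is
        # min(adaptive_lr, lr) once the wrapped optimizer multiplies by lr
        adaptive_lr = min(adaptive_lr / lr, 1.0)
    return adaptive_lr

# toy usage with made-up shapes
w, g = torch.randn(256, 128), 1e-3 * torch.randn(256, 128)
print(larc_trust_ratio(w, g, lr=0.1))
```

With `clip=True` the multiplier can only shrink a step, never enlarge it past the base `lr`, which matches the `min(adaptive_lr / group['lr'], 1)` line in `step()` above.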
-"""PyTorch optimization for BERT model.""" - -import math - -import torch -from torch.nn.utils import clip_grad_norm_ -from torch.optim import Optimizer -from torch.optim.optimizer import required - - -def warmup_cosine(x, warmup=0.002): - if x < warmup: - return x / warmup - return 0.5 * (1.0 + math.cos(math.pi * x)) - - -def warmup_constant(x, warmup=0.002): - """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps. - Learning rate is 1. afterwards. """ - if x < warmup: - return x / warmup - return 1.0 - - -def warmup_linear(x, warmup=0.002): - """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step. - After `t_total`-th training step, learning rate is zero. """ - if x < warmup: - return x / warmup - return max((x - 1.) / (warmup - 1.), 0) - - -SCHEDULES = { - 'warmup_cosine': warmup_cosine, - 'warmup_constant': warmup_constant, - 'warmup_linear': warmup_linear, -} - - -class BertAdam(Optimizer): - """Implements BERT version of Adam algorithm with weight decay fix. - Params: - lr: learning rate - warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 - t_total: total number of training steps for the learning - rate schedule, -1 means constant learning rate. Default: -1 - schedule: schedule to use for the warmup (see above). Default: 'warmup_linear' - b1: Adams b1. Default: 0.9 - b2: Adams b2. Default: 0.999 - e: Adams epsilon. Default: 1e-6 - weight_decay: Weight decay. Default: 0.01 - max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 - """ - - def __init__(self, - params, - lr=required, - warmup=-1, - t_total=-1, - schedule='warmup_linear', - b1=0.9, - b2=0.999, - e=1e-6, - weight_decay=0.01, - max_grad_norm=1.0): - if lr is not required and lr < 0.0: - raise ValueError( - "Invalid learning rate: {} - should be >= 0.0".format(lr)) - if schedule not in SCHEDULES: - raise ValueError("Invalid schedule parameter: {}".format(schedule)) - if not 0.0 <= warmup < 1.0 and not warmup == -1: - raise ValueError( - "Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format( - warmup)) - if not 0.0 <= b1 < 1.0: - raise ValueError( - "Invalid b1 parameter: {} - should be in [0.0, 1.0[".format( - b1)) - if not 0.0 <= b2 < 1.0: - raise ValueError( - "Invalid b2 parameter: {} - should be in [0.0, 1.0[".format( - b2)) - if not e >= 0.0: - raise ValueError( - "Invalid epsilon value: {} - should be >= 0.0".format(e)) - defaults = dict(lr=lr, - schedule=schedule, - warmup=warmup, - t_total=t_total, - b1=b1, - b2=b2, - e=e, - weight_decay=weight_decay, - max_grad_norm=max_grad_norm) - super(BertAdam, self).__init__(params, defaults) - - def get_lr(self): - lr = [] - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - state = self.state[p] - if len(state) == 0: - return [0] - if group['t_total'] != -1: - schedule_fct = SCHEDULES[group['schedule']] - lr_scheduled = group['lr'] * schedule_fct( - state['step'] / group['t_total'], group['warmup']) - else: - lr_scheduled = group['lr'] - lr.append(lr_scheduled) - return lr - - def step(self, closure=None): - """Performs a single optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. 
- """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError( - 'Adam does not support sparse gradients, please consider SparseAdam instead' - ) - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['next_m'] = torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['next_v'] = torch.zeros_like(p.data) - - next_m, next_v = state['next_m'], state['next_v'] - beta1, beta2 = group['b1'], group['b2'] - - # Add grad clipping - if group['max_grad_norm'] > 0: - clip_grad_norm_(p, group['max_grad_norm']) - - # Decay the first and second moment running average coefficient - # In-place operations to update the averages at the same time - # next_m.mul_(beta1).add_(1 - beta1, grad) --> pytorch 1.7 - next_m.mul_(beta1).add_(grad, alpha=1 - beta1) - # next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) --> pytorch 1.7 - next_v.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) - update = next_m / (next_v.sqrt() + group['e']) - - # Just adding the square of the weights to the loss function is *not* - # the correct way of using L2 regularization/weight decay with Adam, - # since that will interact with the m and v parameters in strange ways. - # - # Instead we want to decay the weights in a manner that doesn't interact - # with the m/v parameters. This is equivalent to adding the square - # of the weights to the loss with plain (non-momentum) SGD. - if group['weight_decay'] > 0.0: - update += group['weight_decay'] * p.data - - if group['t_total'] != -1: - schedule_fct = SCHEDULES[group['schedule']] - progress = state['step'] / group['t_total'] - lr_scheduled = group['lr'] * schedule_fct( - progress, group['warmup']) - else: - lr_scheduled = group['lr'] - - update_with_lr = lr_scheduled * update - p.data.add_(-update_with_lr) - - state['step'] += 1 - - return loss \ No newline at end of file diff --git a/AVLFormer/src/solver/build.py b/AVLFormer/src/solver/build.py deleted file mode 100755 index 47442a4..0000000 --- a/AVLFormer/src/solver/build.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
-import re - -import torch - -from .LARC import LARC -from .lr_scheduler import WarmupCosineAnnealingLR, WarmupMultiStepLR -from .optimization import AdamW, WarmupLinearSchedule - - -def make_optimizer(cfg, model, resume=False): - params = [] - for key, value in model.named_parameters(): - if not value.requires_grad: - continue - lr = cfg.SOLVER.BASE_LR - weight_decay = cfg.SOLVER.WEIGHT_DECAY - - for reg_lr in cfg.SOLVER.REGEXP_LR_FACTOR: - regexp, lr_factor = reg_lr - if re.match(regexp, key): - if lr != cfg.SOLVER.BASE_LR: - print("WARNING: {} matched multiple " - "regular expressions!".format(key)) - lr *= lr_factor - - if "bias" in key: - lr *= cfg.SOLVER.BIAS_LR_FACTOR - weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS - - if resume: - params += [{ - "params": [value], - "initial_lr": lr, - "lr": lr, - "weight_decay": weight_decay - }] - else: - params += [{ - "params": [value], - "lr": lr, - "weight_decay": weight_decay - }] - - if cfg.SOLVER.OPTIMIZER == 'sgd': - optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) - elif cfg.SOLVER.OPTIMIZER == 'adam': - optimizer = torch.optim.Adam(params) - elif cfg.SOLVER.OPTIMIZER == 'adamw': - # optimizer = torch.optim.AdamW(params) - if hasattr(cfg, 'adam_epsilon'): - optimizer = AdamW(params, eps=cfg.adam_epsilon) - else: - optimizer = AdamW(params) - else: - raise ValueError('Optimizer "{}" is not supported'.format( - cfg.SOLVER.OPTIMIZER)) - if cfg.SOLVER.USE_LARC: - optimizer = LARC(optimizer, - clip=True, - trust_coefficient=cfg.SOLVER.LARC_COEFFICIENT) - return optimizer - - -def make_lr_scheduler(cfg, optimizer, last_iter=-1): - lr_policy = cfg.SOLVER.LR_POLICY - if lr_policy not in ("multistep", "cosine", 'linear'): - raise ValueError("Only 'multistep' or 'cosine' lr policy is accepted" - "got {}".format(lr_policy)) - if lr_policy == "multistep": - return WarmupMultiStepLR(optimizer, - cfg.SOLVER.STEPS, - cfg.SOLVER.GAMMA, - warmup_factor=cfg.SOLVER.WARMUP_FACTOR, - warmup_iters=cfg.SOLVER.WARMUP_ITERS, - warmup_method=cfg.SOLVER.WARMUP_METHOD, - last_epoch=last_iter) - elif lr_policy == "cosine": - return WarmupCosineAnnealingLR(optimizer, - cfg.SOLVER.MAX_ITER, - cfg.SOLVER.MIN_LR, - warmup_factor=cfg.SOLVER.WARMUP_FACTOR, - warmup_iters=cfg.SOLVER.WARMUP_ITERS, - warmup_method=cfg.SOLVER.WARMUP_METHOD, - last_epoch=last_iter) - elif lr_policy == "linear": - return WarmupLinearSchedule( - optimizer, - warmup_steps=cfg.SOLVER.WARMUP_ITERS, - t_total=cfg.SOLVER.MAX_ITER, - ) diff --git a/AVLFormer/src/solver/get_solver.py b/AVLFormer/src/solver/get_solver.py deleted file mode 100644 index 5d9524f..0000000 --- a/AVLFormer/src/solver/get_solver.py +++ /dev/null @@ -1,34 +0,0 @@ -from .optimization import ( - AdamW, - WarmupConstantSchedule, - WarmupCosineSchedule, - WarmupLinearSchedule, -) - - -def get_optimizer(model, weight_decay, learning_rate, adam_epsilon): - no_decay = ['bias', 'LayerNorm.weight'] - grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not \ - any(nd in n for nd in no_decay)], 'weight_decay': weight_decay}, - {'params': [p for n, p in model.named_parameters() if \ - any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - return AdamW(grouped_parameters, lr=learning_rate, eps=adam_epsilon) - - -def get_scheduler(optimizer, scheduler_type, warmup_steps, t_total): - if scheduler_type == "constant": - scheduler = WarmupConstantSchedule(optimizer, - warmup_steps=warmup_steps) - elif scheduler_type == "linear": - scheduler = WarmupLinearSchedule(optimizer, - 
warmup_steps=warmup_steps, - t_total=t_total) - elif scheduler_type == "cosine": - scheduler = WarmupCosineSchedule(optimizer, - warmup_steps=warmup_steps, - t_total=t_total) - else: - raise ValueError("Unknown scheduler type: {}".format(scheduler_type)) - return scheduler diff --git a/AVLFormer/src/solver/lr_scheduler.py b/AVLFormer/src/solver/lr_scheduler.py deleted file mode 100755 index 9844700..0000000 --- a/AVLFormer/src/solver/lr_scheduler.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -from bisect import bisect_right -import math - -import torch - -from .LARC import LARC - - -# FIXME ideally this would be achieved with a CombinedLRScheduler, -# separating MultiStepLR with WarmupLR -# but the current LRScheduler design doesn't allow it -class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): - - def __init__( - self, - optimizer, - milestones, - gamma=0.1, - warmup_factor=1.0 / 3, - warmup_iters=500, - warmup_method="linear", - last_epoch=-1, - ): - if not list(milestones) == sorted(milestones): - raise ValueError( - "Milestones should be a list of" - " increasing integers. Got {}", - milestones, - ) - - if warmup_method not in ("constant", "linear"): - raise ValueError( - "Only 'constant' or 'linear' warmup_method accepted" - "got {}".format(warmup_method)) - self.milestones = milestones - self.gamma = gamma - self.warmup_factor = warmup_factor - self.warmup_iters = warmup_iters - self.warmup_method = warmup_method - if isinstance(optimizer, LARC): - optimizer = optimizer.optim - super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) - - def get_lr(self): - warmup_factor = 1 - if self.last_epoch < self.warmup_iters: - if self.warmup_method == "constant": - warmup_factor = self.warmup_factor - elif self.warmup_method == "linear": - alpha = float(self.last_epoch) / self.warmup_iters - warmup_factor = self.warmup_factor * (1 - alpha) + alpha - return [ - base_lr * warmup_factor * - self.gamma**bisect_right(self.milestones, self.last_epoch) - for base_lr in self.base_lrs - ] - - -class WarmupCosineAnnealingLR(torch.optim.lr_scheduler._LRScheduler): - - def __init__( - self, - optimizer, - max_iter, - min_lr=0, - warmup_factor=1.0 / 3, - warmup_iters=500, - warmup_method="linear", - last_epoch=-1, - ): - if warmup_method not in ("constant", "linear"): - raise ValueError( - "Only 'constant' or 'linear' warmup_method accepted" - "got {}".format(warmup_method)) - self.max_iter = max_iter - self.min_lr = min_lr - self.warmup_factor = warmup_factor - self.warmup_iters = warmup_iters - self.warmup_method = warmup_method - super(WarmupCosineAnnealingLR, self).__init__(optimizer, last_epoch) - - def get_lr(self): - warmup_factor = 1 - if self.last_epoch < self.warmup_iters: - if self.warmup_method == "constant": - warmup_factor = self.warmup_factor - elif self.warmup_method == "linear": - alpha = self.last_epoch / self.warmup_iters - warmup_factor = self.warmup_factor * (1 - alpha) + alpha - return [base_lr * warmup_factor for base_lr in self.base_lrs] - else: - return [ - self.min_lr + (base_lr - self.min_lr) * - (1 + math.cos(math.pi * self.last_epoch / self.max_iter)) / 2 - for base_lr in self.base_lrs - ] - - -class WarmupLinearLR(torch.optim.lr_scheduler._LRScheduler): - - def __init__( - self, - optimizer, - max_iter, - min_lr=1e-8, - warmup_ratio=0.1, - last_epoch=-1, - ): - self.max_iter = max_iter - self.min_lr = min_lr - self.warmup_ratio = warmup_ratio - self.warmup_iters = int(warmup_ratio * max_iter) - 
super(WarmupLinearLR, self).__init__(optimizer, last_epoch) - - def get_lr_factor(self): - tot_step = self.max_iter - warmup_step = self.warmup_iters - step = self.last_epoch - if step < warmup_step: - return max(0, step / warmup_step) - return max(0, (tot_step - step) / (tot_step - warmup_step)) - - def get_lr(self): - warmup_factor = self.get_lr_factor() - return [ - max(self.min_lr, base_lr * warmup_factor) - for base_lr in self.base_lrs - ] - - -""" -optimizer learning rate scheduling helpers - -Copied from ClipBERT -supports linear/invsqrt/constant/multi_step -""" - - -def noam_schedule(step, warmup_step=4000): - if step <= warmup_step: - return step / warmup_step - return (warmup_step**0.5) * (step**-0.5) - - -def warmup_linear(step, warmup_step, tot_step): - if step < warmup_step: - return step / warmup_step - return max(0, (tot_step - step) / (tot_step - warmup_step)) - - -def multi_step_schedule(n_epoch, milestones, gamma=0.5): - milestones = list(sorted(milestones)) - for i, m in enumerate(milestones): - if n_epoch < m: - return gamma**i - return gamma**(len(milestones) + 1) - - -def get_lr_sched(global_step, - decay, - learning_rate, - num_train_steps, - warmup_ratio=0.1, - decay_epochs=[], - multi_step_epoch=-1): - warmup_steps = int(warmup_ratio * num_train_steps) - if decay == 'linear': - lr_this_step = learning_rate * warmup_linear(global_step, warmup_steps, - num_train_steps) - elif decay == 'invsqrt': - lr_this_step = learning_rate * noam_schedule(global_step, warmup_steps) - elif decay == 'constant': - lr_this_step = learning_rate - elif decay == "multi_step": - assert multi_step_epoch >= 0 - lr_this_step = learning_rate * multi_step_schedule( - multi_step_epoch, decay_epochs) - if lr_this_step <= 0: - # save guard for possible miscalculation of train steps - lr_this_step = 1e-8 - return lr_this_step diff --git a/AVLFormer/src/solver/optimization.py b/AVLFormer/src/solver/optimization.py deleted file mode 100644 index 6a91b59..0000000 --- a/AVLFormer/src/solver/optimization.py +++ /dev/null @@ -1,268 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch optimization for BERT model.""" - -import math - -import torch -from torch.optim import Optimizer -from torch.optim.lr_scheduler import LambdaLR - - -class ConstantLRSchedule(LambdaLR): - """ Constant learning rate schedule. - """ - - def __init__(self, optimizer, last_epoch=-1): - super(ConstantLRSchedule, self).__init__(optimizer, - lambda _: 1.0, - last_epoch=last_epoch) - - -class WarmupConstantSchedule(LambdaLR): - """ Linear warmup and then constant. - Linearly increases learning rate schedule from 0 to 1 over `warmup_steps` training steps. - Keeps learning rate schedule equal to 1. after warmup_steps. 
- """ - - def __init__(self, optimizer, warmup_steps, last_epoch=-1): - self.warmup_steps = warmup_steps - super(WarmupConstantSchedule, self).__init__(optimizer, - self.lr_lambda, - last_epoch=last_epoch) - - def lr_lambda(self, step): - if step < self.warmup_steps: - return float(step) / float(max(1.0, self.warmup_steps)) - return 1. - - -class WarmupMultiStepSchedule(LambdaLR): - """ Linear warmup and then decrease at multiple steps. - Linearly increases learning rate schedule from 0 to 1 over `warmup_steps` training steps. - Reduce LR at specific steps by a given ratio after warmup_steps. - """ - - def __init__(self, - optimizer, - warmup_steps, - decay_steps, - decay_ratio=0.1, - last_epoch=-1): - self.warmup_steps = warmup_steps - self.decay_steps = decay_steps - self.decay_ratio = decay_ratio - super(WarmupMultiStepSchedule, self).__init__(optimizer, - self.lr_lambda, - last_epoch=last_epoch) - - def lr_lambda(self, step): - if step < self.warmup_steps: - return float(step) / float(max(1.0, self.warmup_steps)) - ratio = 1.0 - for decay_step in self.decay_steps: - if step > decay_step: - ratio *= self.decay_ratio - return ratio - - -class WarmupLinearSchedule(LambdaLR): - """ Linear warmup and then linear decay. - Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps. - Linearly decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps. - """ - - def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1): - self.warmup_steps = warmup_steps - self.t_total = t_total - super(WarmupLinearSchedule, self).__init__(optimizer, - self.lr_lambda, - last_epoch=last_epoch) - - def lr_lambda(self, step): - if step < self.warmup_steps: - return float(step) / float(max(1, self.warmup_steps)) - return max( - 0.0, - float(self.t_total - step) / - float(max(1.0, self.t_total - self.warmup_steps))) - - -class WarmupCosineSchedule(LambdaLR): - """ Linear warmup and then cosine decay. - Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps. - Decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps following a cosine curve. - If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup. - """ - - def __init__(self, - optimizer, - warmup_steps, - t_total, - cycles=.5, - last_epoch=-1): - self.warmup_steps = warmup_steps - self.t_total = t_total - self.cycles = cycles - super(WarmupCosineSchedule, self).__init__(optimizer, - self.lr_lambda, - last_epoch=last_epoch) - - def lr_lambda(self, step): - if step < self.warmup_steps: - return float(step) / float(max(1.0, self.warmup_steps)) - # progress after warmup - progress = float(step - self.warmup_steps) / float( - max(1, self.t_total - self.warmup_steps)) - return max( - 0.0, 0.5 * - (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress))) - - -class WarmupCosineWithHardRestartsSchedule(LambdaLR): - """ Linear warmup and then cosine cycles with hard restarts. - Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps. - If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying - learning rate (with hard restarts). 
- """ - - def __init__(self, - optimizer, - warmup_steps, - t_total, - cycles=1., - last_epoch=-1): - self.warmup_steps = warmup_steps - self.t_total = t_total - self.cycles = cycles - super(WarmupCosineWithHardRestartsSchedule, - self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) - - def lr_lambda(self, step): - if step < self.warmup_steps: - return float(step) / float(max(1, self.warmup_steps)) - # progress after warmup - progress = float(step - self.warmup_steps) / float( - max(1, self.t_total - self.warmup_steps)) - if progress >= 1.0: - return 0.0 - return max( - 0.0, 0.5 * - (1. + math.cos(math.pi * ((float(self.cycles) * progress) % 1.0)))) - - -class AdamW(Optimizer): - """ Implements Adam algorithm with weight decay fix. - - Parameters: - lr (float): learning rate. Default 1e-3. - betas (tuple of 2 floats): Adams beta parameters (b1, b2). Default: (0.9, 0.999) - eps (float): Adams epsilon. Default: 1e-6 - weight_decay (float): Weight decay. Default: 0.0 - correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True. - """ - - def __init__(self, - params, - lr=1e-3, - betas=(0.9, 0.999), - eps=1e-6, - weight_decay=0.0, - correct_bias=True): - if lr < 0.0: - raise ValueError( - "Invalid learning rate: {} - should be >= 0.0".format(lr)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError( - "Invalid beta parameter: {} - should be in [0.0, 1.0[".format( - betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError( - "Invalid beta parameter: {} - should be in [0.0, 1.0[".format( - betas[1])) - if not 0.0 <= eps: - raise ValueError( - "Invalid epsilon value: {} - should be >= 0.0".format(eps)) - defaults = dict(lr=lr, - betas=betas, - eps=eps, - weight_decay=weight_decay, - correct_bias=correct_bias) - super(AdamW, self).__init__(params, defaults) - - def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. 
- """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError( - 'Adam does not support sparse gradients, please consider SparseAdam instead' - ) - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p.data) - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] - - state['step'] += 1 - - # Decay the first and second moment running average coefficient - # In-place operations to update the averages at the same time - exp_avg.mul_(beta1).add_(grad, alpha=1.0 - beta1) - exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2) - denom = exp_avg_sq.sqrt().add_(group['eps']) - - step_size = group['lr'] - if group['correct_bias']: # No bias correction for Bert - bias_correction1 = 1.0 - beta1**state['step'] - bias_correction2 = 1.0 - beta2**state['step'] - step_size = step_size * math.sqrt( - bias_correction2) / bias_correction1 - - p.data.addcdiv_(exp_avg, denom, value=-step_size) - - # Just adding the square of the weights to the loss function is *not* - # the correct way of using L2 regularization/weight decay with Adam, - # since that will interact with the m and v parameters in strange ways. - # - # Instead we want to decay the weights in a manner that doesn't interact - # with the m/v parameters. This is equivalent to adding the square - # of the weights to the loss with plain (non-momentum) SGD. - # Add weight decay at the end (fixed version) - if group['weight_decay'] > 0.0: - p.data.add_(p.data, - alpha=-group['lr'] * group['weight_decay']) - - return loss diff --git a/AVLFormer/src/tasks/inference.py b/AVLFormer/src/tasks/inference.py deleted file mode 100644 index 8409b02..0000000 --- a/AVLFormer/src/tasks/inference.py +++ /dev/null @@ -1,376 +0,0 @@ -from __future__ import absolute_import, division, print_function - -import os -import sys - -pythonpath = os.path.abspath( - os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -print(pythonpath) -sys.path.insert(0, pythonpath) -import io -import json -import os.path as op -import time - -from PIL import Image -import av -import numpy as np -from src.configs.config import basic_check_arguments, shared_configs -from src.datasets.caption_tensorizer import build_tensorizer -from src.datasets.data_utils.video_ops import extract_frames_from_video_path -from src.datasets.data_utils.video_transforms import ( - CenterCrop, - Compose, - Normalize, - Resize, -) -from src.datasets.data_utils.volume_transforms import ClipToTensor -from src.modeling.load_bert import get_bert_model -from src.modeling.load_passt import MyPasst -from src.modeling.load_swin import get_swin_model, reload_pretrained_swin -from src.modeling.video_captioning_e2e_vid_swin_bert import VideoTransformer -from src.utils.comm import dist_init, get_rank, get_world_size, is_main_process -from src.utils.deepspeed import fp32_to_fp16 -from src.utils.logger import LOGGER as logger -from src.utils.logger import TB_LOGGER, RunningMeter, add_log_to_file -from src.utils.miscellaneous import mkdir, set_seed, str_to_bool -import torch - - -def _online_video_decode(args, video_path): - decoder_num_frames = getattr(args, 'max_num_frames', 2) - 
frames, _ = extract_frames_from_video_path(video_path, - target_fps=3, - num_frames=decoder_num_frames, - multi_thread_decode=False, - sampling_strategy="uniform", - safeguard_duration=False, - start=None, - end=None) - return frames - - -def _transforms(args, frames): - raw_video_crop_list = [ - Resize(args.img_res), - CenterCrop((args.img_res, args.img_res)), - ClipToTensor(channel_nb=3), - Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - ] - raw_video_process = Compose(raw_video_crop_list) - - frames = frames.numpy() - frames = np.transpose(frames, (0, 2, 3, 1)) - num_of_frames, height, width, channels = frames.shape - - frame_list = [] - for i in range(args.max_num_frames): - frame_list.append(Image.fromarray(frames[i])) - - # apply normalization, output tensor (C x T x H x W) in the range [0, 1.0] - crop_frames = raw_video_process(frame_list) - # (C x T x H x W) --> (T x C x H x W) - crop_frames = crop_frames.permute(1, 0, 2, 3) - return crop_frames - - -def check_arguments(args): - # shared basic checks - basic_check_arguments(args) - # additional sanity check: - args.max_img_seq_length = int( - (args.max_num_frames / 2) * (int(args.img_res) / 32) * - (int(args.img_res) / 32)) + 473 - - if args.freeze_backbone or args.backbone_coef_lr == 0: - args.backbone_coef_lr = 0 - args.freeze_backbone = True - - if 'reload_pretrained_swin' not in args.keys(): - args.reload_pretrained_swin = False - - if not len(args.pretrained_checkpoint) and args.reload_pretrained_swin: - logger.info( - "No pretrained_checkpoint to be loaded, disable --reload_pretrained_swin" - ) - args.reload_pretrained_swin = False - - if args.learn_mask_enabled: - args.attn_mask_type = 'learn_vid_att' - - -def update_existing_config_for_inference(args): - ''' load swinbert args for evaluation and inference - ''' - assert args.do_test or args.do_eval - checkpoint = args.eval_model_dir - try: - json_path = op.join(checkpoint, os.pardir, 'log', 'args.json') - with open(json_path, 'r') as f: - json_data = json.load(f) - - from easydict import EasyDict - train_args = EasyDict(json_data) - except Exception: - train_args = torch.load(op.join(checkpoint, 'training_args.bin')) - - train_args.eval_model_dir = args.eval_model_dir - train_args.resume_checkpoint = args.eval_model_dir + 'model.bin' - train_args.model_name_or_path = 'models/captioning/bert-base-uncased/' - train_args.do_train = False - train_args.do_eval = True - train_args.do_test = True - train_args.val_yaml = args.val_yaml - train_args.test_video_fname = args.test_video_fname - return train_args - - -def decode_mp3(mp3_arr): - """ - decodes an array of uint8 representing an mp3 file - :rtype: np.array - """ - container = av.open(io.BytesIO(mp3_arr.tobytes())) - stream = next(s for s in container.streams if s.type == 'audio') - a = [] - for packet in container.demux(stream): - for frame in packet.decode(): - a.append(frame.to_ndarray().reshape(-1)) - waveform = np.concatenate(a) - if waveform.dtype != 'float32': - raise RuntimeError('Unexpected wave type') - return waveform - - -def pydub_augment(waveform, gain_augment=7, ir_augment=0): - if gain_augment: - gain = torch.randint(gain_augment * 2, (1, )).item() - gain_augment - amp = 10**(gain / 20) - waveform = waveform * amp - return waveform - - -def pad_or_truncate(x, audio_length): - """Pad all audio to specific length.""" - if len(x) <= audio_length: - return np.concatenate( - (x, np.zeros(audio_length - len(x), dtype=np.float32)), axis=0) - else: - return x[0:audio_length] -
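Together, `decode_mp3`, `pydub_augment`, and `pad_or_truncate` above form the audio front end that `get_audio` below composes. A self-contained sketch of the same pipeline, minus the random gain augmentation (assumes PyAV is installed; `mp3_bytes_to_fixed_waveform` is an illustrative name):

```python
import io

import av
import numpy as np

def mp3_bytes_to_fixed_waveform(mp3_bytes, audio_length=10 * 32000):
    """Decode mp3 bytes, then pad or truncate to a fixed sample count."""
    container = av.open(io.BytesIO(mp3_bytes))
    stream = next(s for s in container.streams if s.type == 'audio')
    chunks = [frame.to_ndarray().reshape(-1)
              for packet in container.demux(stream)
              for frame in packet.decode()]
    waveform = np.concatenate(chunks).astype(np.float32)
    if len(waveform) <= audio_length:
        pad = np.zeros(audio_length - len(waveform), dtype=np.float32)
        return np.concatenate((waveform, pad))
    return waveform[:audio_length]

# hypothetical usage; 'clip.mp3' is a placeholder path
# wave = mp3_bytes_to_fixed_waveform(open('clip.mp3', 'rb').read())
# print(wave.shape)  # (320000,) at the 32 kHz, 10 s budget assumed above
```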
-def get_audio(audio_path): - data = np.fromfile(audio_path, dtype='uint8') - wave_form = decode_mp3(data) - wave_form = pydub_augment(waveform=wave_form) - wave_form = pad_or_truncate(x=wave_form, audio_length=10 * 32000) - - return wave_form.reshape(1, -1) - - -def batch_inference(args, video_path, audio_path, model, tokenizer, - tensorizer): - - cls_token_id, sep_token_id, pad_token_id, mask_token_id, period_token_id = \ - tokenizer.convert_tokens_to_ids([tokenizer.cls_token, tokenizer.sep_token, - tokenizer.pad_token, tokenizer.mask_token, '.']) - - model.float() - model.eval() - for video in os.listdir(video_path): - if video.split('.')[-1] == 'mp4': - v_path = os.path.join(video_path, video) - a_path = os.path.join(audio_path, f'{video[:-3]}mp3') - logger.info(f"\n") - logger.info(f"Load video: {v_path}") - - frames = _online_video_decode(args, v_path) - preproc_frames = _transforms(args, frames) - wave_form = get_audio(a_path) - data_sample = tensorizer.tensorize_example_e2e('', - preproc_frames, - wave_form, - mode=args.att_mode) - - data_sample = list(data_sample) - data_sample[4] = torch.Tensor(data_sample[4]) - - data_sample = tuple(t.to(args.device) for t in data_sample) - - with torch.no_grad(): - - inputs = { - 'is_decode': True, - 'input_ids': data_sample[0][None, :], - 'attention_mask': data_sample[1][None, :], - 'token_type_ids': data_sample[2][None, :], - 'img_feats': data_sample[3][None, :], - 'audio_feat': data_sample[4][None, :], - 'masked_pos': data_sample[5][None, :], - 'input_token_ids': data_sample[6][None, :], - 'output_token_ids': data_sample[7][None, :], - 'do_sample': False, - 'bos_token_id': cls_token_id, - 'pad_token_id': pad_token_id, - 'eos_token_ids': [sep_token_id], - 'mask_token_id': mask_token_id, - # for adding od labels - 'add_od_labels': args.add_od_labels, - 'od_labels_start_posid': args.max_seq_a_length, - # hyperparameters of beam search - 'max_length': args.max_gen_length, - 'num_beams': args.num_beams, - "temperature": args.temperature, - "top_k": args.top_k, - "top_p": args.top_p, - "repetition_penalty": args.repetition_penalty, - "length_penalty": args.length_penalty, - "num_return_sequences": args.num_return_sequences, - "num_keep_best": args.num_keep_best, - } - tic = time.time() - outputs = model(**inputs) - - time_meter = time.time() - tic - all_caps = outputs[0] # batch_size * num_keep_best * max_len - all_confs = torch.exp(outputs[1]) - - for caps, confs in zip(all_caps, all_confs): - for cap, conf in zip(caps, confs): - cap = tokenizer.decode(cap.tolist(), - skip_special_tokens=True) - logger.info(f"Prediction: {cap}") - logger.info(f"Conf: {conf.item()}") - - logger.info( - f"Inference model computing time: {time_meter} seconds") - - -def get_custom_args(base_config): - parser = base_config.parser - parser.add_argument('--max_num_frames', type=int, default=32) - parser.add_argument('--img_res', type=int, default=224) - parser.add_argument('--patch_size', type=int, default=32) - parser.add_argument("--grid_feat", - type=str_to_bool, - nargs='?', - const=True, - default=True) - parser.add_argument("--kinetics", - type=str, - default='400', - help="400 or 600") - parser.add_argument("--pretrained_2d", - type=str_to_bool, - nargs='?', - const=True, - default=False) - parser.add_argument("--vidswin_size", type=str, default='base') - parser.add_argument('--freeze_backbone', - type=str_to_bool, - nargs='?', - const=True, - default=False) - parser.add_argument('--use_checkpoint', - type=str_to_bool, - nargs='?', - const=True, - default=False) 
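Every boolean switch in `get_custom_args` uses the `type=str_to_bool, nargs='?', const=True` pattern shown above and below. A minimal sketch of how that parses on the command line (this `str_to_bool` is an assumed re-implementation for illustration, not the one imported from `src.utils.miscellaneous`):

```python
import argparse

def str_to_bool(value):
    # assumed behaviour: map common true/false spellings onto real booleans
    if value.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if value.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(f'Boolean value expected, got {value!r}')

parser = argparse.ArgumentParser()
# nargs='?' plus const=True lets a bare `--freeze_backbone` enable the flag,
# while an explicit value such as `--freeze_backbone false` disables it
parser.add_argument('--freeze_backbone', type=str_to_bool,
                    nargs='?', const=True, default=False)

print(parser.parse_args([]).freeze_backbone)                              # False
print(parser.parse_args(['--freeze_backbone']).freeze_backbone)           # True
print(parser.parse_args(['--freeze_backbone', 'false']).freeze_backbone)  # False
```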
- parser.add_argument('--backbone_coef_lr', type=float, default=0.001) - parser.add_argument("--reload_pretrained_swin", - type=str_to_bool, - nargs='?', - const=True, - default=False) - parser.add_argument('--learn_mask_enabled', - type=str_to_bool, - nargs='?', - const=True, - default=False) - parser.add_argument('--loss_sparse_w', type=float, default=0) - parser.add_argument('--sparse_mask_soft2hard', - type=str_to_bool, - nargs='?', - const=True, - default=False) - parser.add_argument( - '--transfer_method', - type=int, - default=-1, - help= - "0: load all SwinBERT pre-trained weights, 1: load only pre-trained sparse mask" - ) - parser.add_argument( - '--att_mask_expansion', - type=int, - default=-1, - help= - "-1: random init, 0: random init and then diag-based copy, 1: interpolation" - ) - parser.add_argument('--resume_checkpoint', type=str, default='None') - parser.add_argument('--test_video_fname', type=str, default='None') - parser.add_argument('--test_audio_fname', type=str, default='None') - args = base_config.parse_args() - return args - - -def main(args): - args_test_audio_fname = args.test_audio_fname - args = update_existing_config_for_inference(args) - args.test_audio_fname = args_test_audio_fname - # global training_saver - args.device = torch.device(args.device) - # Setup CUDA, GPU & distributed training - dist_init(args) - check_arguments(args) - set_seed(args.seed, args.num_gpus) - fp16_training = None - logger.info("device: {}, n_gpu: {}, rank: {}, " - "16-bits training: {}".format(args.device, args.num_gpus, - get_rank(), fp16_training)) - - if not is_main_process(): - logger.disabled = True - - logger.info(f"Pytorch version is: {torch.__version__}") - logger.info(f"Cuda version is: {torch.version.cuda}") - logger.info(f"cuDNN version is: {torch.backends.cudnn.version()}") - - # Get Passt - passt_model = MyPasst() - passt_model.freeze() - # Get Video Swin model - swin_model = get_swin_model(args) - # Get BERT and tokenizer - bert_model, config, tokenizer = get_bert_model(args) - # build SwinBERT based on training configs - vl_transformer = VideoTransformer(args, config, swin_model, bert_model, - passt_model) - vl_transformer.freeze_backbone(freeze=args.freeze_backbone) - - # load weights for inference - logger.info(f"Loading state dict from checkpoint {args.resume_checkpoint}") - cpu_device = torch.device('cpu') - pretrained_model = torch.load(args.resume_checkpoint, - map_location=cpu_device) - - if isinstance(pretrained_model, dict): - rst = vl_transformer.load_state_dict(pretrained_model, strict=True) - else: - rst = vl_transformer.load_state_dict(pretrained_model.state_dict(), - strict=True) - - logger.info(f'Result of loading weights: {rst}') - - vl_transformer.to(args.device) - vl_transformer.eval() - - tensorizer = build_tensorizer(args, tokenizer, is_train=False) - batch_inference(args, args.test_video_fname, args.test_audio_fname, - vl_transformer, tokenizer, tensorizer) - - -if __name__ == "__main__": - shared_configs.shared_video_captioning_config(cbs=True, scst=True) - args = get_custom_args(shared_configs) - main(args) diff --git a/AVLFormer/src/tasks/train.py b/AVLFormer/src/tasks/train.py deleted file mode 100644 index 041d483..0000000 --- a/AVLFormer/src/tasks/train.py +++ /dev/null @@ -1,873 +0,0 @@ -from __future__ import absolute_import, division, print_function - -import os -import sys - -pythonpath = os.path.abspath( - os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -print(pythonpath) -sys.path.insert(0, pythonpath) -import 
datetime -import gc -import json -import os -import os.path as op -import time - -from apex import amp -from apex.parallel import DistributedDataParallel as DDP -import deepspeed -from src.configs.config import ( - basic_check_arguments, - restore_training_settings, - shared_configs, -) -from src.datasets.vl_dataloader import make_data_loader -from src.evalcap.utils_caption_evaluate import evaluate_on_coco_caption -from src.modeling.load_bert import get_bert_model -from src.modeling.load_passt import MyPasst -from src.modeling.load_swin import get_swin_model, reload_pretrained_swin -from src.modeling.video_captioning_e2e_vid_swin_bert import VideoTransformer -from src.solver import AdamW, WarmupLinearLR -from src.utils.comm import dist_init, get_rank, get_world_size, is_main_process -from src.utils.deepspeed import fp32_to_fp16, get_deepspeed_config -from src.utils.load_save import TrainingRestorer, TrainingSaver -from src.utils.logger import LOGGER as logger -from src.utils.logger import TB_LOGGER, RunningMeter, add_log_to_file -from src.utils.metric_logger import MetricLogger -from src.utils.miscellaneous import ( - NoOp, - concat_tsv_files, - delete_tsv_files, - mkdir, - set_seed, - str_to_bool, -) -from src.utils.tsv_file_ops import reorder_tsv_keys, tsv_writer -import torch -import torch.distributed as dist -from tqdm import tqdm - - -def compute_score_with_logits(logits, labels): - logits = torch.max(logits, -1)[1].data # argmax - return logits == labels - - -def mixed_precision_init(args, model): - max_iter = args.max_iter - max_global_step = args.max_global_step - global_iters_per_epoch = args.global_iters_per_epoch - - param_optimizer = list(model.named_parameters()) - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - - decay_param_tp = [(n, p) for n, p in param_optimizer - if not any(nd in n for nd in no_decay)] - no_decay_param_tp = [(n, p) for n, p in param_optimizer - if any(nd in n for nd in no_decay)] - - decay_swin_param_tp = [(n, p) for n, p in decay_param_tp if "swin." in n] - decay_bert_param_tp = [(n, p) for n, p in decay_param_tp - if "swin." not in n] - - no_decay_swin_param_tp = [(n, p) for n, p in no_decay_param_tp - if "swin." in n] - no_decay_bert_param_tp = [(n, p) for n, p in no_decay_param_tp - if "swin." 
not in n] - - weight_decay = 0.2 - coef_lr = args.backbone_coef_lr - optimizer_grouped_parameters = [{ - 'params': [p for n, p in decay_swin_param_tp], - 'weight_decay': - weight_decay, - 'lr': - args.learning_rate * coef_lr - }, { - 'params': [p for n, p in decay_bert_param_tp], - 'weight_decay': - weight_decay - }, { - 'params': [p for n, p in no_decay_swin_param_tp], - 'weight_decay': - 0.0, - 'lr': - args.learning_rate * coef_lr - }, { - 'params': [p for n, p in no_decay_bert_param_tp], - 'weight_decay': - 0.0 - }] - - if args.mixed_precision_method == "fairscale": - from fairscale.optim.oss import OSS - optimizer = OSS(params=optimizer_grouped_parameters, - optim=AdamW, - lr=args.learning_rate, - eps=args.adam_epsilon) - else: - optimizer = AdamW(optimizer_grouped_parameters, - lr=args.learning_rate, - eps=args.adam_epsilon) - if args.scheduler == "warmup_linear": - scheduler = WarmupLinearLR(optimizer, - max_global_step, - warmup_ratio=args.warmup_ratio) - else: - scheduler = torch.optim.lr_scheduler.StepLR(optimizer, - step_size=int(max_iter / - 2.0), - gamma=0.1) - - if args.mixed_precision_method == "deepspeed": - config = get_deepspeed_config(args) - model, optimizer, _, _ = deepspeed.initialize(config_params=config, - model=model, - optimizer=optimizer, - lr_scheduler=scheduler) - elif args.mixed_precision_method == "fairscale": - from fairscale.optim.grad_scaler import ShardedGradScaler - scaler = ShardedGradScaler() - # this is equivalent to deepspeed zero_opt_stage = 2 - from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP - model = ShardedDDP( - model, - optimizer, - reduce_buffer_size=0 - if args.fairscale_fp16 else 2**23, # 2 ** 23 is the default value - reduce_fp16=args.fairscale_fp16) - else: - # opt_level is O0, Apex will run as fp32 - model, optimizer = amp.initialize(model, - optimizer, - enabled=True, - opt_level=f'O{args.amp_opt_level}') - if args.distributed: # - model = DDP(model) - return args, model, optimizer, scheduler - - -def train(args, train_dataloader, val_dataloader, model, tokenizer, - training_saver, optimizer, scheduler): - meters = MetricLogger(delimiter=' ') - max_iter = args.max_iter - max_global_step = args.max_global_step - global_iters_per_epoch = args.global_iters_per_epoch - - eval_log = [] - best_score = 0 - start_training_time = time.time() - end = time.time() - log_start = time.time() - running_loss = RunningMeter('train_loss') - running_batch_acc = RunningMeter('train_batch_acc') - - if args.restore_ratio > 0: - restorer = TrainingRestorer(args, model, optimizer) - global_step = restorer.global_step - else: - global_step = 0 - - TB_LOGGER.global_step = global_step - if not is_main_process() or args.restore_ratio <= 0: - restorer = NoOp() - - training_saver.save_args(args) - training_saver.save_tokenizer(tokenizer) - - for iteration, (img_keys, batch, meta_data) in enumerate(train_dataloader): - iteration += 1 - data_time = time.time() - end - batch = tuple(t.to(args.device) for t in batch) - model.train() - # img_feats (B, #F, C, W, H) - inputs = { - 'input_ids': batch[0], - 'attention_mask': batch[1], - 'token_type_ids': batch[2], - 'img_feats': batch[3], - 'audio_feat': batch[4], - 'masked_pos': batch[5], - 'masked_ids': batch[6], - 'input_token_ids': batch[7], - 'output_token_ids': batch[8], - } - - if iteration == 1: - for k, v in inputs.items(): - logger.info(f'{k} = {v.shape}') - - if args.deepspeed_fp16: - # deepspeed does not autocast inputs - inputs = fp32_to_fp16(inputs) - - if args.mixed_precision_method == 
"fairscale": - with torch.cuda.amp.autocast(enabled=True): - outputs = model(**inputs) - else: - outputs = model(**inputs) - loss, logits = outputs[:2] - - if args.learn_mask_enabled: - loss_sparsity = outputs[-1] - loss = loss + (loss_sparsity * args.loss_sparse_w) - - lm_loss, mask_loss = outputs[2], outputs[3] - - batch_score = compute_score_with_logits(logits, - inputs['output_token_ids']) - batch_acc = torch.mean(batch_score.float()) - - if args.learn_mask_enabled: - loss_dict = { - 'loss': loss, - 'loss_sparsity': loss_sparsity.item(), - 'acc': batch_acc, - 'lm_loss': lm_loss, - 'mask_loss': mask_loss - } - else: - loss_dict = { - 'loss': loss, - 'acc': batch_acc, - 'lm_loss': lm_loss, - 'mask_loss': mask_loss - } - meters.update(**loss_dict) - running_loss(loss.item()) - running_batch_acc(batch_acc.item()) - - # backward pass - backward_now = iteration % args.gradient_accumulation_steps == 0 - if args.mixed_precision_method == "deepspeed": - model.backward(loss) - elif args.mixed_precision_method == "fairscale": - scaler.scale(loss).backward() - else: - # apex - with amp.scale_loss(loss, - optimizer, - delay_unscale=not backward_now) as scaled_loss: - scaled_loss.backward() - if backward_now: - global_step += 1 - TB_LOGGER.add_scalar('train/loss', running_loss.val, global_step) - - lr_VisBone = optimizer.param_groups[0]["lr"] - lr_LM = optimizer.param_groups[1]["lr"] - - TB_LOGGER.add_scalar("train/lr_lm", lr_LM, global_step) - TB_LOGGER.add_scalar("train/ls_visBone", lr_VisBone, global_step) - - if args.max_grad_norm != -1: - grad_norm = torch.nn.utils.clip_grad_norm_( - amp.master_params(optimizer), args.max_grad_norm) - TB_LOGGER.add_scalar("train/grad_norm", grad_norm, global_step) - TB_LOGGER.step() - if args.mixed_precision_method == "deepspeed": - model.step() - elif args.mixed_precision_method == "fairscale": - scaler.step(optimizer) - scaler.update() - scheduler.step() - model.zero_grad() - else: - optimizer.step() - scheduler.step() - optimizer.zero_grad() - restorer.step() - - batch_time = time.time() - end - - if backward_now: - if global_step % args.logging_steps == 0 or global_step == max_global_step: - if 'time_info' in meters.meters: - avg_time = meters.meters['time_info']['compute'].global_avg - eta_seconds = avg_time * (max_iter - iteration) - eta_string = str( - datetime.timedelta(seconds=int(eta_seconds))) - else: - eta_string = 'Unknown' - eta_seconds = batch_time * (max_iter - iteration) - eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) - speed = args.num_gpus * args.logging_steps * len( - batch[0]) / (time.time() - log_start) - memory = torch.cuda.max_memory_allocated() / 1024.0 / 1024.0 - logger.info( - meters.delimiter.join([ - f"eta: {eta_string}", - f"iter: {iteration}", - f"global_step: {global_step}", - f'speed: {speed:.1f} images/sec', - f"{meters}", - f"lr (Visual Encoder): {lr_VisBone:.2e}", - f"lr (LM): {lr_LM:.2e}", - f"max mem: {memory:.0f}", - ])) - TB_LOGGER.add_scalar("train/speed", speed, global_step) - TB_LOGGER.add_scalar("train/memory", memory, global_step) - TB_LOGGER.add_scalar("train/batch_time", batch_time, - global_step) - TB_LOGGER.add_scalar("train/data_time", data_time, global_step) - log_start = time.time() - - if (args.save_steps > 0 and global_step % args.save_steps == 0 - ) or global_step == max_global_step or global_step == 1: - epoch = global_step // global_iters_per_epoch - - checkpoint_dir = op.join( - args.output_dir, - 'checkpoint-{}-{}'.format(epoch, global_step)) - if get_world_size() > 1: - dist.barrier() 
- training_saver.save_model(checkpoint_dir, global_step, model, - optimizer) - if get_world_size() > 1: - dist.barrier() - if args.evaluate_during_training: - logger.info( - f"Perform evaluation at iteration {iteration}, global_step {global_step}" - ) - evaluate_file = evaluate(args, val_dataloader, model, - tokenizer, checkpoint_dir) - if get_world_size() > 1: - dist.barrier() - if is_main_process(): - with open(evaluate_file, 'r') as f: - res = json.load(f) - val_log = {f'valid/{k}': v for k, v in res.items()} - TB_LOGGER.log_scalar_dict(val_log) - # aml_run.log(name='CIDEr', value=float(res['CIDEr'])) - - best_score = max(best_score, res['CIDEr']) - res['epoch'] = epoch - res['iteration'] = iteration - res['best_CIDEr'] = best_score - eval_log.append(res) - with open( - op.join( - args.output_dir, - args.val_yaml.replace('/', '_') + - 'eval_logs.json'), 'w') as f: - json.dump(eval_log, f) - if get_world_size() > 1: - dist.barrier() - - if iteration > 2: - meters.update( - batch_time=batch_time, - data_time=data_time, - ) - end = time.time() - - if global_step >= max_global_step and (max_iter - iteration): - logger.info( - f'Missing {max_iter - iteration} iterations, early break') - break - - total_training_time = time.time() - start_training_time - total_time_str = str(datetime.timedelta(seconds=total_training_time)) - logger.info( - f'Total training time: {total_time_str} ({(total_training_time / max_iter):.4f} s / iter)' - ) - return checkpoint_dir - - -def get_predict_file(output_dir, args, data_yaml_file): - cc = ['pred'] - # example data_yaml_file: datasets/coco_caption/test.yaml - data = data_yaml_file.split('/')[-2] - if data != 'coco_caption': - cc.append(data) - cc.append(op.splitext(op.basename(data_yaml_file))[0]) - cc.append('beam{}'.format(args.num_beams)) - cc.append('max{}'.format(args.max_gen_length)) - if args.num_keep_best != 1: - cc.append('best{}'.format(args.num_keep_best)) - if args.output_hidden_states: - cc.append('hidden') - return op.join(output_dir, '{}.tsv'.format('.'.join(cc))) - - -def get_evaluate_file(predict_file): - assert predict_file.endswith('.tsv') - return op.splitext(predict_file)[0] + '.eval.json' - - -def evaluate(args, val_dataloader, model, tokenizer, output_dir): - predict_file = get_predict_file(output_dir, args, - val_dataloader.dataset.yaml_file) - test(args, val_dataloader, model, tokenizer, predict_file) - - if get_world_size() > 1: - dist.barrier() - evaluate_file = get_evaluate_file(predict_file) - if is_main_process(): - caption_file = val_dataloader.dataset.get_caption_file_in_coco_format() - data = val_dataloader.dataset.yaml_file.split('/')[-2] - result = evaluate_on_coco_caption(predict_file, - caption_file, - outfile=evaluate_file) - logger.info(f'evaluation result: {str(result)}') - logger.info(f'evaluation result saved to {evaluate_file}') - if get_world_size() > 1: - dist.barrier() - return evaluate_file - - -def test(args, test_dataloader, model, tokenizer, predict_file): - - cls_token_id, sep_token_id, pad_token_id, mask_token_id, period_token_id = \ - tokenizer.convert_tokens_to_ids([tokenizer.cls_token, tokenizer.sep_token, - tokenizer.pad_token, tokenizer.mask_token, '.']) - world_size = get_world_size() - if world_size == 1: - cache_file = predict_file - else: - # local_rank would not work for cross-node distributed training - cache_file = op.splitext(predict_file)[0] + '_{}_{}'.format( - get_rank(), world_size) + op.splitext(predict_file)[1] - - model.eval() - - def gen_rows(): - time_meter = 0 - # restore existing 
results for long running inference tasks - exist_key2pred = {} - tmp_file = cache_file + '.tmp.copy' - if op.isfile(tmp_file): - with open(tmp_file, 'r') as fp: - for line in fp: - parts = line.strip().split('\t') - if len(parts) == 2: - exist_key2pred[parts[0]] = parts[1] - - with torch.no_grad(): - for step, (img_keys, batch, - meta_data) in tqdm(enumerate(test_dataloader)): - # torch.cuda.empty_cache() - is_exist = True - for k in img_keys: - if k not in exist_key2pred: - is_exist = False - break - if is_exist: - for k in img_keys: - yield k, exist_key2pred[k] - continue - batch = tuple(t.to(args.device) for t in batch) - inputs = { - 'is_decode': True, - 'input_ids': batch[0], - 'attention_mask': batch[1], - 'token_type_ids': batch[2], - 'img_feats': batch[3], - 'audio_feat': batch[4], - 'masked_pos': batch[5], - 'input_token_ids': batch[6], - 'output_token_ids': batch[7], - 'do_sample': False, - 'bos_token_id': cls_token_id, - 'pad_token_id': pad_token_id, - 'eos_token_ids': [sep_token_id], - 'mask_token_id': mask_token_id, - # for adding od labels - 'add_od_labels': args.add_od_labels, - 'od_labels_start_posid': args.max_seq_a_length, - # hyperparameters of beam search - 'max_length': args.max_gen_length, - 'num_beams': args.num_beams, - "temperature": args.temperature, - "top_k": args.top_k, - "top_p": args.top_p, - "repetition_penalty": args.repetition_penalty, - "length_penalty": args.length_penalty, - "num_return_sequences": args.num_return_sequences, - "num_keep_best": args.num_keep_best, - } - - tic = time.time() - # captions, logprobs - - if args.deepspeed_fp16: - # deepspeed does not auto cast inputs. - inputs = fp32_to_fp16(inputs) - - if args.mixed_precision_method == "fairscale": - with torch.cuda.amp.autocast(enabled=True): - outputs = model(**inputs) - else: - outputs = model(**inputs) - time_meter += time.time() - tic - all_caps = outputs[0] # batch_size * num_keep_best * max_len - all_confs = torch.exp(outputs[1]) - - for img_key, caps, confs in zip(img_keys, all_caps, all_confs): - res = [] - for cap, conf in zip(caps, confs): - cap = tokenizer.decode(cap.tolist(), - skip_special_tokens=True) - res.append({'caption': cap, 'conf': conf.item()}) - if isinstance(img_key, torch.Tensor): - img_key = img_key.item() - yield img_key, json.dumps(res) - - logger.info( - f"Inference model computing time: {(time_meter / (step+1))} seconds per batch" - ) - - tsv_writer(gen_rows(), cache_file) - if world_size > 1: - dist.barrier() - if world_size > 1 and is_main_process(): - cache_files = [op.splitext(predict_file)[0] + '_{}_{}'.format(i, world_size) + \ - op.splitext(predict_file)[1] for i in range(world_size)] - concat_tsv_files(cache_files, predict_file) - delete_tsv_files(cache_files) - reorder_tsv_keys(predict_file, test_dataloader.dataset.image_keys, - predict_file) - if world_size > 1: - dist.barrier() - - -def check_arguments(args): - # shared basic checks - basic_check_arguments(args) - # additional sanity check: - args.max_img_seq_length = int( - (args.max_num_frames / 2) * (int(args.img_res) / 32) * - (int(args.img_res) / 32)) + 473 - - if args.freeze_backbone or args.backbone_coef_lr == 0: - args.backbone_coef_lr = 0 - args.freeze_backbone = True - - if 'reload_pretrained_swin' not in args.keys(): - args.reload_pretrained_swin = False - - if not len(args.pretrained_checkpoint) and args.reload_pretrained_swin: - logger.info( - "No pretrained_checkpoint to be loaded, disable --reload_pretrained_swin" - ) - args.reload_pretrained_swin = False - - if args.learn_mask_enabled 
== True: - args.attn_mask_type = 'learn_vid_att' - - -def update_existing_config_for_inference(args): - ''' load swinbert args for evaluation and inference - ''' - assert args.do_test or args.do_eval - checkpoint = args.eval_model_dir - try: - json_path = op.join(checkpoint, os.pardir, 'log', 'args.json') - f = open(json_path, 'r') - json_data = json.load(f) - - from easydict import EasyDict - train_args = EasyDict(json_data) - except Exception as e: - train_args = torch.load(op.join(checkpoint, 'training_args.bin')) - - train_args.eval_model_dir = args.eval_model_dir - train_args.resume_checkpoint = args.eval_model_dir + 'model.bin' - train_args.model_name_or_path = 'models/captioning/bert-base-uncased/' - train_args.do_train = False - train_args.do_eval = True - train_args.do_test = True - train_args.val_yaml = args.val_yaml - train_args.test_video_fname = args.test_video_fname - return train_args - - -def get_custom_args(base_config): - parser = base_config.parser - parser.add_argument('--max_num_frames', type=int, default=32) - parser.add_argument('--img_res', type=int, default=224) - parser.add_argument('--patch_size', type=int, default=32) - parser.add_argument("--grid_feat", - type=str_to_bool, - nargs='?', - const=True, - default=True) - parser.add_argument("--kinetics", - type=str, - default='400', - help="400 or 600") - parser.add_argument("--att_mode", - type=str, - default='default', - help="default, full") - parser.add_argument("--lambda_", - type=float, - default=0.5, - help="lambda_ loss") - parser.add_argument("--pretrained_2d", - type=str_to_bool, - nargs='?', - const=True, - default=False) - parser.add_argument("--vidswin_size", type=str, default='base') - parser.add_argument('--freeze_backbone', - type=str_to_bool, - nargs='?', - const=True, - default=False) - parser.add_argument('--freeze_passt', - type=str_to_bool, - nargs='?', - const=True, - default=False) - parser.add_argument('--use_checkpoint', - type=str_to_bool, - nargs='?', - const=True, - default=False) - parser.add_argument('--backbone_coef_lr', type=float, default=0.001) - parser.add_argument("--reload_pretrained_swin", - type=str_to_bool, - nargs='?', - const=True, - default=False) - parser.add_argument('--learn_mask_enabled', - type=str_to_bool, - nargs='?', - const=True, - default=False) - parser.add_argument('--loss_sparse_w', type=float, default=0) - parser.add_argument('--sparse_mask_soft2hard', - type=str_to_bool, - nargs='?', - const=True, - default=False) - parser.add_argument( - '--transfer_method', - type=int, - default=0, - help= - "0: load all SwinBERT pre-trained weights, 1: load only pre-trained sparse mask" - ) - parser.add_argument( - '--att_mask_expansion', - type=int, - default=0, - help= - "-1: random init, 0: random init and then diag-based copy, 1: interpolation" - ) - parser.add_argument('--resume_checkpoint', type=str, default='None') - parser.add_argument('--test_video_fname', type=str, default='None') - args = base_config.parse_args() - return args - - -def main(args): - if args.do_train == False or args.do_eval == True: - args = update_existing_config_for_inference(args) - - # global training_saver - args.device = torch.device(args.device) - # Setup CUDA, GPU & distributed training - dist_init(args) - check_arguments(args) - mkdir(args.output_dir) - logger.info(f"creating output_dir at: {args.output_dir}") - set_seed(args.seed, args.num_gpus) - - if args.mixed_precision_method == "apex": - fp16_trainning = f"apex O{args.amp_opt_level}" - elif args.mixed_precision_method == 
"deepspeed": - amp_info = '' if args.deepspeed_fp16 else f'amp, {args.amp_opt_level}' - fp16_info = '' if not args.deepspeed_fp16 else f'fp16, {args.zero_opt_stage}' - fp16_trainning = f"deepspeed, {amp_info}{fp16_info}" - elif args.mixed_precision_method == "fairscale": - assert args.distributed, "fairscale can only be used for distributed training" - fp16_trainning = f"fairscale, fp16: {args.fairscale_fp16}, default zero_opt 2" - else: - fp16_trainning = None - - logger.info("device: {}, n_gpu: {}, rank: {}, " - "16-bits training: {}".format(args.device, args.num_gpus, - get_rank(), fp16_trainning)) - - if not is_main_process(): - logger.disabled = True - training_saver = NoOp() - else: - training_saver = TrainingSaver(args.output_dir) - TB_LOGGER.create(op.join(args.output_dir, 'log')) - add_log_to_file(op.join(args.output_dir, 'log', "log.txt")) - - logger.info(f"Pytorch version is: {torch.__version__}") - logger.info(f"Cuda version is: {torch.version.cuda}") - logger.info(f"cuDNN version is : {torch.backends.cudnn.version()}") - - # Get Passt - passt_model = MyPasst() - if args.freeze_passt: - passt_model.freeze() - # Get Video Swin model - swin_model = get_swin_model(args) - # Get BERT and tokenizer - bert_model, config, tokenizer = get_bert_model(args) - # build SwinBERT based on training configs - vl_transformer = VideoTransformer(args, config, swin_model, bert_model, - passt_model) - vl_transformer.freeze_backbone(freeze=args.freeze_backbone) - - if args.do_eval: - # load weights for eval/inference - logger.info( - f"Loading state dict from checkpoint {args.resume_checkpoint}") - cpu_device = torch.device('cpu') - pretrained_model = torch.load(args.resume_checkpoint, - map_location=cpu_device) - - if isinstance(pretrained_model, dict): - vl_transformer.load_state_dict(pretrained_model, strict=False) - else: - vl_transformer.load_state_dict(pretrained_model.state_dict(), - strict=False) - - elif args.do_train and args.pretrained_checkpoint != '': - ckpt_path = args.pretrained_checkpoint + 'model.bin' - assert op.exists(ckpt_path), f"{ckpt_path} does not exist" - logger.info(f"Loading state dict from checkpoint {ckpt_path}") - cpu_device = torch.device('cpu') - pretrained_model = torch.load(ckpt_path, map_location=cpu_device) - - if args.learn_mask_enabled == False: - if isinstance(pretrained_model, dict): - vl_transformer.load_state_dict(pretrained_model, strict=False) - else: - vl_transformer.load_state_dict(pretrained_model.state_dict(), - strict=False) - - elif args.learn_mask_enabled == True: - pretrained_mask_shape = pretrained_model[ - 'learn_vid_att.weight'].shape - init_mask_shape = vl_transformer.learn_vid_att.weight.shape - - #------------------------------------------------------------- - # transfer at the same frame rate - if pretrained_mask_shape == init_mask_shape: - # init using entire pre-trained SwinBERT weights - if args.transfer_method == 0: - if isinstance(pretrained_model, dict): - vl_transformer.load_state_dict(pretrained_model, - strict=False) - else: - vl_transformer.load_state_dict( - pretrained_model.state_dict(), strict=False) - # init using only pre-trained sparse att mask weights - else: - vl_transformer.reload_attn_mask( - pretrained_model['learn_vid_att.weight']) - #------------------------------------------------------------- - # transfer across different frame rates - else: - # init using entire pre-trained SwinBERT weights, except sparse attn mask - if args.transfer_method == 0: - if isinstance(pretrained_model, dict): - new_state_dict = {} - for 
k, v in pretrained_model.items(): - - if k != 'learn_vid_att.weight' or ( - k == 'learn_vid_att.weight' and - pretrained_mask_shape == init_mask_shape): - new_state_dict[k] = v - - rst = vl_transformer.load_state_dict(new_state_dict, - strict=False) - del new_state_dict - else: - pretrained_model_state_dict = pretrained_model.state_dict( - ) - new_state_dict = {} - for k, v in pretrained_model_state_dict.items(): - if k != 'learn_vid_att.weight' or ( - k == 'learn_vid_att.weight' and - pretrained_mask_shape == init_mask_shape): - new_state_dict[k] = v - rst = vl_transformer.load_state_dict(new_state_dict, - strict=False) - del new_state_dict - - # load_state_dict returns (missing_keys, unexpected_keys) - missing_keys = [] - for k in rst[0]: - if 'passt' not in k: - missing_keys.append(k) - logger.info( - f'load without learn_vid_att, Missing Keys (passt excluded): {missing_keys}, Unexpected Keys: {rst[1]}' - ) - - # expand pre-trained sparse att mask to the desired size - if args.att_mask_expansion == 0: - vl_transformer.diag_based_init_attn_mask( - pretrained_model['learn_vid_att.weight']) - elif args.att_mask_expansion == 1: - vl_transformer.bilinear_init_attn_mask( - pretrained_model['learn_vid_att.weight']) - else: - vl_transformer.random_init_attn_mask() - - del pretrained_model - gc.collect() - torch.cuda.empty_cache() - - args.eval_model_dir = args.pretrained_checkpoint - checkpoint = args.eval_model_dir - assert op.isdir(checkpoint) - vl_transformer.max_img_seq_length = int(args.max_img_seq_length) - vl_transformer.config.num_visual_tokens = int(args.max_img_seq_length) - args.model_name_or_path = args.pretrained_checkpoint - if args.reload_pretrained_swin: - vl_transformer.swin = reload_pretrained_swin( - vl_transformer.swin, args) - - vl_transformer.to(args.device) - - if args.do_train: - args = restore_training_settings(args) - train_dataloader = make_data_loader(args, - args.train_yaml, - tokenizer, - args.distributed, - is_train=True) - val_dataloader = make_data_loader(args, - args.val_yaml, - tokenizer, - args.distributed, - is_train=False) - - args.max_iter = len(train_dataloader) - args.max_global_step = args.max_iter // args.gradient_accumulation_steps - args.global_iters_per_epoch = args.max_global_step // args.num_train_epochs - args.save_steps = args.global_iters_per_epoch * 3 - - args, vl_transformer, optimizer, scheduler = mixed_precision_init( - args, vl_transformer) - train(args, train_dataloader, val_dataloader, vl_transformer, - tokenizer, training_saver, optimizer, scheduler) - - elif args.do_eval: - val_dataloader = make_data_loader(args, - args.val_yaml, - tokenizer, - args.distributed, - is_train=False) - args, vl_transformer, _, _ = mixed_precision_init(args, vl_transformer) - evaluate_file = evaluate(args, val_dataloader, vl_transformer, - tokenizer, args.eval_model_dir) - - if args.distributed: - dist.destroy_process_group() - - -if __name__ == "__main__": - shared_configs.shared_video_captioning_config(cbs=True, scst=True) - args = get_custom_args(shared_configs) - main(args) diff --git a/AVLFormer/src/timm/__init__.py b/AVLFormer/src/timm/__init__.py deleted file mode 100644 index 04ec7e5..0000000 --- a/AVLFormer/src/timm/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .version import __version__ -from .models import create_model, list_models, is_model, list_modules, model_entrypoint, \ - is_scriptable, is_exportable, set_scriptable, set_exportable, has_model_default_key, is_model_default_key, \ - get_model_default_value, is_model_pretrained diff
--git a/AVLFormer/src/timm/data/__init__.py b/AVLFormer/src/timm/data/__init__.py deleted file mode 100644 index 7d3cb2b..0000000 --- a/AVLFormer/src/timm/data/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from .auto_augment import RandAugment, AutoAugment, rand_augment_ops, auto_augment_policy,\ - rand_augment_transform, auto_augment_transform -from .config import resolve_data_config -from .constants import * -from .dataset import ImageDataset, IterableImageDataset, AugMixDataset -from .dataset_factory import create_dataset -from .loader import create_loader -from .mixup import Mixup, FastCollateMixup -from .parsers import create_parser -from .real_labels import RealLabelsImagenet -from .transforms import * -from .transforms_factory import create_transform \ No newline at end of file diff --git a/AVLFormer/src/timm/data/auto_augment.py b/AVLFormer/src/timm/data/auto_augment.py deleted file mode 100644 index cbf5464..0000000 --- a/AVLFormer/src/timm/data/auto_augment.py +++ /dev/null @@ -1,817 +0,0 @@ -""" AutoAugment, RandAugment, and AugMix for PyTorch - -This code implements the searched ImageNet policies with various tweaks and improvements and -does not include any of the search code. - -AA and RA Implementation adapted from: - https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/autoaugment.py - -AugMix adapted from: - https://github.com/google-research/augmix - -Papers: - AutoAugment: Learning Augmentation Policies from Data - https://arxiv.org/abs/1805.09501 - Learning Data Augmentation Strategies for Object Detection - https://arxiv.org/abs/1906.11172 - RandAugment: Practical automated data augmentation... - https://arxiv.org/abs/1909.13719 - AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty - https://arxiv.org/abs/1912.02781 - -Hacked together by / Copyright 2020 Ross Wightman -""" -import random -import math -import re -from PIL import Image, ImageOps, ImageEnhance, ImageChops -import PIL -import numpy as np - - -_PIL_VER = tuple([int(x) for x in PIL.__version__.split('.')[:2]]) - -_FILL = (128, 128, 128) - -# This signifies the max integer that the controller RNN could predict for the -# augmentation scheme. -_MAX_LEVEL = 10. 
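# Illustrative sketch (not part of the original timm source): every op in this
# file receives an integer magnitude "level" in [0, _MAX_LEVEL], and a
# _level_to_arg helper rescales it into the op's natural parameter range.
# The convention in miniature, assuming a rotate-style op with a +/-30 degree
# range like the _rotate_level_to_arg defined further down:
def _demo_level_to_degrees(level, max_degrees=30.):
    # linear rescale: level 10 -> 30.0 degrees, level 5 -> 15.0 degrees
    return (level / _MAX_LEVEL) * max_degrees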
- -_HPARAMS_DEFAULT = dict( - translate_const=250, - img_mean=_FILL, -) - -_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC) - - -def _interpolation(kwargs): - interpolation = kwargs.pop('resample', Image.BILINEAR) - if isinstance(interpolation, (list, tuple)): - return random.choice(interpolation) - else: - return interpolation - - -def _check_args_tf(kwargs): - if 'fillcolor' in kwargs and _PIL_VER < (5, 0): - kwargs.pop('fillcolor') - kwargs['resample'] = _interpolation(kwargs) - - -def shear_x(img, factor, **kwargs): - _check_args_tf(kwargs) - return img.transform(img.size, Image.AFFINE, (1, factor, 0, 0, 1, 0), **kwargs) - - -def shear_y(img, factor, **kwargs): - _check_args_tf(kwargs) - return img.transform(img.size, Image.AFFINE, (1, 0, 0, factor, 1, 0), **kwargs) - - -def translate_x_rel(img, pct, **kwargs): - pixels = pct * img.size[0] - _check_args_tf(kwargs) - return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), **kwargs) - - -def translate_y_rel(img, pct, **kwargs): - pixels = pct * img.size[1] - _check_args_tf(kwargs) - return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), **kwargs) - - -def translate_x_abs(img, pixels, **kwargs): - _check_args_tf(kwargs) - return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), **kwargs) - - -def translate_y_abs(img, pixels, **kwargs): - _check_args_tf(kwargs) - return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), **kwargs) - - -def rotate(img, degrees, **kwargs): - _check_args_tf(kwargs) - if _PIL_VER >= (5, 2): - return img.rotate(degrees, **kwargs) - elif _PIL_VER >= (5, 0): - w, h = img.size - post_trans = (0, 0) - rotn_center = (w / 2.0, h / 2.0) - angle = -math.radians(degrees) - matrix = [ - round(math.cos(angle), 15), - round(math.sin(angle), 15), - 0.0, - round(-math.sin(angle), 15), - round(math.cos(angle), 15), - 0.0, - ] - - def transform(x, y, matrix): - (a, b, c, d, e, f) = matrix - return a * x + b * y + c, d * x + e * y + f - - matrix[2], matrix[5] = transform( - -rotn_center[0] - post_trans[0], -rotn_center[1] - post_trans[1], matrix - ) - matrix[2] += rotn_center[0] - matrix[5] += rotn_center[1] - return img.transform(img.size, Image.AFFINE, matrix, **kwargs) - else: - return img.rotate(degrees, resample=kwargs['resample']) - - -def auto_contrast(img, **__): - return ImageOps.autocontrast(img) - - -def invert(img, **__): - return ImageOps.invert(img) - - -def equalize(img, **__): - return ImageOps.equalize(img) - - -def solarize(img, thresh, **__): - return ImageOps.solarize(img, thresh) - - -def solarize_add(img, add, thresh=128, **__): - lut = [] - for i in range(256): - if i < thresh: - lut.append(min(255, i + add)) - else: - lut.append(i) - if img.mode in ("L", "RGB"): - if img.mode == "RGB" and len(lut) == 256: - lut = lut + lut + lut - return img.point(lut) - else: - return img - - -def posterize(img, bits_to_keep, **__): - if bits_to_keep >= 8: - return img - return ImageOps.posterize(img, bits_to_keep) - - -def contrast(img, factor, **__): - return ImageEnhance.Contrast(img).enhance(factor) - - -def color(img, factor, **__): - return ImageEnhance.Color(img).enhance(factor) - - -def brightness(img, factor, **__): - return ImageEnhance.Brightness(img).enhance(factor) - - -def sharpness(img, factor, **__): - return ImageEnhance.Sharpness(img).enhance(factor) - - -def _randomly_negate(v): - """With 50% prob, negate the value""" - return -v if random.random() > 0.5 else v - - -def _rotate_level_to_arg(level, _hparams): - # range [-30, 30] - level = 
(level / _MAX_LEVEL) * 30. - level = _randomly_negate(level) - return level, - - -def _enhance_level_to_arg(level, _hparams): - # range [0.1, 1.9] - return (level / _MAX_LEVEL) * 1.8 + 0.1, - - -def _enhance_increasing_level_to_arg(level, _hparams): - # the 'no change' level is 1.0, moving away from that towards 0. or 2.0 increases the enhancement blend - # range [0.1, 1.9] - level = (level / _MAX_LEVEL) * .9 - level = 1.0 + _randomly_negate(level) - return level, - - -def _shear_level_to_arg(level, _hparams): - # range [-0.3, 0.3] - level = (level / _MAX_LEVEL) * 0.3 - level = _randomly_negate(level) - return level, - - -def _translate_abs_level_to_arg(level, hparams): - translate_const = hparams['translate_const'] - level = (level / _MAX_LEVEL) * float(translate_const) - level = _randomly_negate(level) - return level, - - -def _translate_rel_level_to_arg(level, hparams): - # default range [-0.45, 0.45] - translate_pct = hparams.get('translate_pct', 0.45) - level = (level / _MAX_LEVEL) * translate_pct - level = _randomly_negate(level) - return level, - - -def _posterize_level_to_arg(level, _hparams): - # As per Tensorflow TPU EfficientNet impl - # range [0, 4], 'keep 0 up to 4 MSB of original image' - # intensity/severity of augmentation decreases with level - return int((level / _MAX_LEVEL) * 4), - - -def _posterize_increasing_level_to_arg(level, hparams): - # As per Tensorflow models research and UDA impl - # range [4, 0], 'keep 4 down to 0 MSB of original image', - # intensity/severity of augmentation increases with level - return 4 - _posterize_level_to_arg(level, hparams)[0], - - -def _posterize_original_level_to_arg(level, _hparams): - # As per original AutoAugment paper description - # range [4, 8], 'keep 4 up to 8 MSB of image' - # intensity/severity of augmentation decreases with level - return int((level / _MAX_LEVEL) * 4) + 4, - - -def _solarize_level_to_arg(level, _hparams): - # range [0, 256] - # intensity/severity of augmentation decreases with level - return int((level / _MAX_LEVEL) * 256), - - -def _solarize_increasing_level_to_arg(level, _hparams): - # range [0, 256] - # intensity/severity of augmentation increases with level - return 256 - _solarize_level_to_arg(level, _hparams)[0], - - -def _solarize_add_level_to_arg(level, _hparams): - # range [0, 110] - return int((level / _MAX_LEVEL) * 110), - - -LEVEL_TO_ARG = { - 'AutoContrast': None, - 'Equalize': None, - 'Invert': None, - 'Rotate': _rotate_level_to_arg, - # There are several variations of the posterize level scaling in various Tensorflow/Google repositories/papers - 'Posterize': _posterize_level_to_arg, - 'PosterizeIncreasing': _posterize_increasing_level_to_arg, - 'PosterizeOriginal': _posterize_original_level_to_arg, - 'Solarize': _solarize_level_to_arg, - 'SolarizeIncreasing': _solarize_increasing_level_to_arg, - 'SolarizeAdd': _solarize_add_level_to_arg, - 'Color': _enhance_level_to_arg, - 'ColorIncreasing': _enhance_increasing_level_to_arg, - 'Contrast': _enhance_level_to_arg, - 'ContrastIncreasing': _enhance_increasing_level_to_arg, - 'Brightness': _enhance_level_to_arg, - 'BrightnessIncreasing': _enhance_increasing_level_to_arg, - 'Sharpness': _enhance_level_to_arg, - 'SharpnessIncreasing': _enhance_increasing_level_to_arg, - 'ShearX': _shear_level_to_arg, - 'ShearY': _shear_level_to_arg, - 'TranslateX': _translate_abs_level_to_arg, - 'TranslateY': _translate_abs_level_to_arg, - 'TranslateXRel': _translate_rel_level_to_arg, - 'TranslateYRel': _translate_rel_level_to_arg, -} - - -NAME_TO_OP = { - 
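    # Annotation (not in the original source): NAME_TO_OP pairs each policy op
    # name with its PIL-based implementation, while LEVEL_TO_ARG above pairs the
    # same names with magnitude converters; AugmentOp below looks each name up
    # in both tables, so the two dicts must stay key-for-key in sync.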
'AutoContrast': auto_contrast, - 'Equalize': equalize, - 'Invert': invert, - 'Rotate': rotate, - 'Posterize': posterize, - 'PosterizeIncreasing': posterize, - 'PosterizeOriginal': posterize, - 'Solarize': solarize, - 'SolarizeIncreasing': solarize, - 'SolarizeAdd': solarize_add, - 'Color': color, - 'ColorIncreasing': color, - 'Contrast': contrast, - 'ContrastIncreasing': contrast, - 'Brightness': brightness, - 'BrightnessIncreasing': brightness, - 'Sharpness': sharpness, - 'SharpnessIncreasing': sharpness, - 'ShearX': shear_x, - 'ShearY': shear_y, - 'TranslateX': translate_x_abs, - 'TranslateY': translate_y_abs, - 'TranslateXRel': translate_x_rel, - 'TranslateYRel': translate_y_rel, -} - - -class AugmentOp: - - def __init__(self, name, prob=0.5, magnitude=10, hparams=None): - hparams = hparams or _HPARAMS_DEFAULT - self.aug_fn = NAME_TO_OP[name] - self.level_fn = LEVEL_TO_ARG[name] - self.prob = prob - self.magnitude = magnitude - self.hparams = hparams.copy() - self.kwargs = dict( - fillcolor=hparams['img_mean'] if 'img_mean' in hparams else _FILL, - resample=hparams['interpolation'] if 'interpolation' in hparams else _RANDOM_INTERPOLATION, - ) - - # If magnitude_std is > 0, we introduce some randomness - # in the usually fixed policy and sample magnitude from a normal distribution - # with mean `magnitude` and std-dev of `magnitude_std`. - # NOTE This is my own hack, being tested, not in papers or reference impls. - self.magnitude_std = self.hparams.get('magnitude_std', 0) - - def __call__(self, img): - if self.prob < 1.0 and random.random() > self.prob: - return img - magnitude = self.magnitude - if self.magnitude_std and self.magnitude_std > 0: - magnitude = random.gauss(magnitude, self.magnitude_std) - magnitude = min(_MAX_LEVEL, max(0, magnitude)) # clip to valid range - level_args = self.level_fn(magnitude, self.hparams) if self.level_fn is not None else tuple() - return self.aug_fn(img, *level_args, **self.kwargs) - - -def auto_augment_policy_v0(hparams): - # ImageNet v0 policy from TPU EfficientNet impl, cannot find a paper reference. 
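    # Annotation (not in the original source): each inner list below is one
    # sub-policy of (op_name, probability, magnitude_level) tuples.
    # AutoAugment.__call__ further down draws a single sub-policy uniformly at
    # random per image and applies its ops in order, each firing with its own
    # probability.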
- policy = [ - [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)], - [('Color', 0.4, 9), ('Equalize', 0.6, 3)], - [('Color', 0.4, 1), ('Rotate', 0.6, 8)], - [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], - [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], - [('Color', 0.2, 0), ('Equalize', 0.8, 8)], - [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], - [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)], - [('Color', 0.6, 1), ('Equalize', 1.0, 2)], - [('Invert', 0.4, 9), ('Rotate', 0.6, 0)], - [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)], - [('Color', 0.4, 7), ('Equalize', 0.6, 0)], - [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)], - [('Solarize', 0.6, 8), ('Color', 0.6, 9)], - [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)], - [('Rotate', 1.0, 7), ('TranslateYRel', 0.8, 9)], - [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)], - [('ShearY', 0.8, 0), ('Color', 0.6, 4)], - [('Color', 1.0, 0), ('Rotate', 0.6, 2)], - [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], - [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], - [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)], - [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)], # This results in black image with Tpu posterize - [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], - [('Color', 0.8, 6), ('Rotate', 0.4, 5)], - ] - pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] - return pc - - -def auto_augment_policy_v0r(hparams): - # ImageNet v0 policy from TPU EfficientNet impl, with variation of Posterize used - # in Google research implementation (number of bits discarded increases with magnitude) - policy = [ - [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)], - [('Color', 0.4, 9), ('Equalize', 0.6, 3)], - [('Color', 0.4, 1), ('Rotate', 0.6, 8)], - [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], - [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], - [('Color', 0.2, 0), ('Equalize', 0.8, 8)], - [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], - [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)], - [('Color', 0.6, 1), ('Equalize', 1.0, 2)], - [('Invert', 0.4, 9), ('Rotate', 0.6, 0)], - [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)], - [('Color', 0.4, 7), ('Equalize', 0.6, 0)], - [('PosterizeIncreasing', 0.4, 6), ('AutoContrast', 0.4, 7)], - [('Solarize', 0.6, 8), ('Color', 0.6, 9)], - [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)], - [('Rotate', 1.0, 7), ('TranslateYRel', 0.8, 9)], - [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)], - [('ShearY', 0.8, 0), ('Color', 0.6, 4)], - [('Color', 1.0, 0), ('Rotate', 0.6, 2)], - [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], - [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], - [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)], - [('PosterizeIncreasing', 0.8, 2), ('Solarize', 0.6, 10)], - [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], - [('Color', 0.8, 6), ('Rotate', 0.4, 5)], - ] - pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] - return pc - - -def auto_augment_policy_original(hparams): - # ImageNet policy from https://arxiv.org/abs/1805.09501 - policy = [ - [('PosterizeOriginal', 0.4, 8), ('Rotate', 0.6, 9)], - [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], - [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], - [('PosterizeOriginal', 0.6, 7), ('PosterizeOriginal', 0.6, 6)], - [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], - [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], - [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], - [('PosterizeOriginal', 0.8, 5), ('Equalize', 1.0, 2)], - [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], - [('Equalize', 0.6, 8), ('PosterizeOriginal', 0.4, 6)], - [('Rotate', 0.8, 8), ('Color', 0.4, 0)], - [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], - 
[('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], - [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], - [('Color', 0.6, 4), ('Contrast', 1.0, 8)], - [('Rotate', 0.8, 8), ('Color', 1.0, 2)], - [('Color', 0.8, 8), ('Solarize', 0.8, 7)], - [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], - [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], - [('Color', 0.4, 0), ('Equalize', 0.6, 3)], - [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], - [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], - [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], - [('Color', 0.6, 4), ('Contrast', 1.0, 8)], - [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], - ] - pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] - return pc - - -def auto_augment_policy_originalr(hparams): - # ImageNet policy from https://arxiv.org/abs/1805.09501 with research posterize variation - policy = [ - [('PosterizeIncreasing', 0.4, 8), ('Rotate', 0.6, 9)], - [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], - [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], - [('PosterizeIncreasing', 0.6, 7), ('PosterizeIncreasing', 0.6, 6)], - [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], - [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], - [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], - [('PosterizeIncreasing', 0.8, 5), ('Equalize', 1.0, 2)], - [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], - [('Equalize', 0.6, 8), ('PosterizeIncreasing', 0.4, 6)], - [('Rotate', 0.8, 8), ('Color', 0.4, 0)], - [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], - [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], - [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], - [('Color', 0.6, 4), ('Contrast', 1.0, 8)], - [('Rotate', 0.8, 8), ('Color', 1.0, 2)], - [('Color', 0.8, 8), ('Solarize', 0.8, 7)], - [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], - [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], - [('Color', 0.4, 0), ('Equalize', 0.6, 3)], - [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], - [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], - [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], - [('Color', 0.6, 4), ('Contrast', 1.0, 8)], - [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], - ] - pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] - return pc - - -def auto_augment_policy(name='v0', hparams=None): - hparams = hparams or _HPARAMS_DEFAULT - if name == 'original': - return auto_augment_policy_original(hparams) - elif name == 'originalr': - return auto_augment_policy_originalr(hparams) - elif name == 'v0': - return auto_augment_policy_v0(hparams) - elif name == 'v0r': - return auto_augment_policy_v0r(hparams) - else: - assert False, 'Unknown AA policy (%s)' % name - - -class AutoAugment: - - def __init__(self, policy): - self.policy = policy - - def __call__(self, img): - sub_policy = random.choice(self.policy) - for op in sub_policy: - img = op(img) - return img - - -def auto_augment_transform(config_str, hparams): - """ - Create a AutoAugment transform - - :param config_str: String defining configuration of auto augmentation. Consists of multiple sections separated by - dashes ('-'). The first section defines the AutoAugment policy (one of 'v0', 'v0r', 'original', 'originalr'). 
- The remaining sections, not order specific, determine - 'mstd' - float std deviation of magnitude noise applied - Ex 'original-mstd0.5' results in AutoAugment with original policy, magnitude_std 0.5 - - :param hparams: Other hparams (kwargs) for the AutoAugmentation scheme - - :return: A PyTorch compatible Transform - """ - config = config_str.split('-') - policy_name = config[0] - config = config[1:] - for c in config: - cs = re.split(r'(\d.*)', c) - if len(cs) < 2: - continue - key, val = cs[:2] - if key == 'mstd': - # noise param injected via hparams for now - hparams.setdefault('magnitude_std', float(val)) - else: - assert False, 'Unknown AutoAugment config section' - aa_policy = auto_augment_policy(policy_name, hparams=hparams) - return AutoAugment(aa_policy) - - -_RAND_TRANSFORMS = [ - 'AutoContrast', - 'Equalize', - 'Invert', - 'Rotate', - 'Posterize', - 'Solarize', - 'SolarizeAdd', - 'Color', - 'Contrast', - 'Brightness', - 'Sharpness', - 'ShearX', - 'ShearY', - 'TranslateXRel', - 'TranslateYRel', - #'Cutout' # NOTE I've implemented this as random erasing separately -] - - -_RAND_INCREASING_TRANSFORMS = [ - 'AutoContrast', - 'Equalize', - 'Invert', - 'Rotate', - 'PosterizeIncreasing', - 'SolarizeIncreasing', - 'SolarizeAdd', - 'ColorIncreasing', - 'ContrastIncreasing', - 'BrightnessIncreasing', - 'SharpnessIncreasing', - 'ShearX', - 'ShearY', - 'TranslateXRel', - 'TranslateYRel', - #'Cutout' # NOTE I've implemented this as random erasing separately -] - - - -# These experimental weights are based loosely on the relative improvements mentioned in paper. -# They may not result in increased performance, but could likely be tuned to do so. -_RAND_CHOICE_WEIGHTS_0 = { - 'Rotate': 0.3, - 'ShearX': 0.2, - 'ShearY': 0.2, - 'TranslateXRel': 0.1, - 'TranslateYRel': 0.1, - 'Color': .025, - 'Sharpness': 0.025, - 'AutoContrast': 0.025, - 'Solarize': .005, - 'SolarizeAdd': .005, - 'Contrast': .005, - 'Brightness': .005, - 'Equalize': .005, - 'Posterize': 0, - 'Invert': 0, -} - - -def _select_rand_weights(weight_idx=0, transforms=None): - transforms = transforms or _RAND_TRANSFORMS - assert weight_idx == 0 # only one set of weights currently - rand_weights = _RAND_CHOICE_WEIGHTS_0 - probs = [rand_weights[k] for k in transforms] - probs /= np.sum(probs) - return probs - - -def rand_augment_ops(magnitude=10, hparams=None, transforms=None): - hparams = hparams or _HPARAMS_DEFAULT - transforms = transforms or _RAND_TRANSFORMS - return [AugmentOp( - name, prob=0.5, magnitude=magnitude, hparams=hparams) for name in transforms] - - -class RandAugment: - def __init__(self, ops, num_layers=2, choice_weights=None): - self.ops = ops - self.num_layers = num_layers - self.choice_weights = choice_weights - - def __call__(self, img): - # no replacement when using weighted choice - ops = np.random.choice( - self.ops, self.num_layers, replace=self.choice_weights is None, p=self.choice_weights) - for op in ops: - img = op(img) - return img - - -def rand_augment_transform(config_str, hparams): - """ - Create a RandAugment transform - - :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by - dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand').
The remaining - sections, not order specific, determine - 'm' - integer magnitude of rand augment - 'n' - integer num layers (number of transform ops selected per image) - 'w' - integer probability weight index (index of a set of weights to influence choice of op) - 'mstd' - float std deviation of magnitude noise applied - 'inc' - integer (bool), use augmentations that increase in severity with magnitude (default: 0) - Ex 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5 - 'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2 - - :param hparams: Other hparams (kwargs) for the RandAugmentation scheme - - :return: A PyTorch compatible Transform - """ - magnitude = _MAX_LEVEL # default to _MAX_LEVEL for magnitude (currently 10) - num_layers = 2 # default to 2 ops per image - weight_idx = None # default to no probability weights for op choice - transforms = _RAND_TRANSFORMS - config = config_str.split('-') - assert config[0] == 'rand' - config = config[1:] - for c in config: - cs = re.split(r'(\d.*)', c) - if len(cs) < 2: - continue - key, val = cs[:2] - if key == 'mstd': - # noise param injected via hparams for now - hparams.setdefault('magnitude_std', float(val)) - elif key == 'inc': - if bool(val): - transforms = _RAND_INCREASING_TRANSFORMS - elif key == 'm': - magnitude = int(val) - elif key == 'n': - num_layers = int(val) - elif key == 'w': - weight_idx = int(val) - else: - assert False, 'Unknown RandAugment config section' - ra_ops = rand_augment_ops(magnitude=magnitude, hparams=hparams, transforms=transforms) - choice_weights = None if weight_idx is None else _select_rand_weights(weight_idx) - return RandAugment(ra_ops, num_layers, choice_weights=choice_weights) - - -_AUGMIX_TRANSFORMS = [ - 'AutoContrast', - 'ColorIncreasing', # not in paper - 'ContrastIncreasing', # not in paper - 'BrightnessIncreasing', # not in paper - 'SharpnessIncreasing', # not in paper - 'Equalize', - 'Rotate', - 'PosterizeIncreasing', - 'SolarizeIncreasing', - 'ShearX', - 'ShearY', - 'TranslateXRel', - 'TranslateYRel', -] - - -def augmix_ops(magnitude=10, hparams=None, transforms=None): - hparams = hparams or _HPARAMS_DEFAULT - transforms = transforms or _AUGMIX_TRANSFORMS - return [AugmentOp( - name, prob=1.0, magnitude=magnitude, hparams=hparams) for name in transforms] - - -class AugMixAugment: - """ AugMix Transform - Adapted and improved from impl here: https://github.com/google-research/augmix/blob/master/imagenet.py - From paper: 'AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty - - https://arxiv.org/abs/1912.02781 - """ - def __init__(self, ops, alpha=1., width=3, depth=-1, blended=False): - self.ops = ops - self.alpha = alpha - self.width = width - self.depth = depth - self.blended = blended # blended mode is faster but not well tested - - def _calc_blended_weights(self, ws, m): - ws = ws * m - cump = 1. - rws = [] - for w in ws[::-1]: - alpha = w / cump - cump *= (1 - alpha) - rws.append(alpha) - return np.array(rws[::-1], dtype=np.float32) - - def _apply_blended(self, img, mixing_weights, m): - # This is my first crack at implementing a slightly faster mixed augmentation. Instead - # of accumulating the mix for each chain in a Numpy array and then blending with original, - # it recomputes the blending coefficients and applies one PIL image blend per chain. - # TODO the results appear in the right ballpark but they differ by more than rounding.
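        # Annotation (sketch, not in the original source): the intended identity
        # is that blending sequentially, img <- Image.blend(img, chain_i, w_i'),
        # with the recomputed weights w_i' from _calc_blended_weights, reproduces
        # the convex combination (1 - m) * orig + m * sum_i(ws_i * chain_i(orig)).
        # Each w_i' is the chain's scaled weight (m * ws_i) divided by the blend
        # mass not yet consumed by the chains applied after it.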
- img_orig = img.copy() - ws = self._calc_blended_weights(mixing_weights, m) - for w in ws: - depth = self.depth if self.depth > 0 else np.random.randint(1, 4) - ops = np.random.choice(self.ops, depth, replace=True) - img_aug = img_orig # no ops are in-place, deep copy not necessary - for op in ops: - img_aug = op(img_aug) - img = Image.blend(img, img_aug, w) - return img - - def _apply_basic(self, img, mixing_weights, m): - # This is a literal adaptation of the paper/official implementation without normalizations and - # PIL <-> Numpy conversions between every op. It is still quite CPU compute heavy compared to the - # typical augmentation transforms, could use a GPU / Kornia implementation. - img_shape = img.size[0], img.size[1], len(img.getbands()) - mixed = np.zeros(img_shape, dtype=np.float32) - for mw in mixing_weights: - depth = self.depth if self.depth > 0 else np.random.randint(1, 4) - ops = np.random.choice(self.ops, depth, replace=True) - img_aug = img # no ops are in-place, deep copy not necessary - for op in ops: - img_aug = op(img_aug) - mixed += mw * np.asarray(img_aug, dtype=np.float32) - np.clip(mixed, 0, 255., out=mixed) - mixed = Image.fromarray(mixed.astype(np.uint8)) - return Image.blend(img, mixed, m) - - def __call__(self, img): - mixing_weights = np.float32(np.random.dirichlet([self.alpha] * self.width)) - m = np.float32(np.random.beta(self.alpha, self.alpha)) - if self.blended: - mixed = self._apply_blended(img, mixing_weights, m) - else: - mixed = self._apply_basic(img, mixing_weights, m) - return mixed - - -def augment_and_mix_transform(config_str, hparams): - """ Create AugMix PyTorch transform - - :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by - dashes ('-'). The first section defines the specific variant of the transform (currently only 'augmix'). The remaining - sections, not order specific, determine - 'm' - integer magnitude (severity) of augmentation mix (default: 3) - 'w' - integer width of augmentation chain (default: 3) - 'd' - integer depth of augmentation chain (-1 is random [1, 3], default: -1) - 'b' - integer (bool), blend each branch of chain into end result without a final blend, less CPU (default: 0) - 'mstd' - float std deviation of magnitude noise applied (default: 0) - Ex 'augmix-m5-w4-d2' results in AugMix with severity 5, chain width 4, chain depth 2 - - :param hparams: Other hparams (kwargs) for the Augmentation transforms - - :return: A PyTorch compatible Transform - """ - magnitude = 3 - width = 3 - depth = -1 - alpha = 1.
- blended = False - config = config_str.split('-') - assert config[0] == 'augmix' - config = config[1:] - for c in config: - cs = re.split(r'(\d.*)', c) - if len(cs) < 2: - continue - key, val = cs[:2] - if key == 'mstd': - # noise param injected via hparams for now - hparams.setdefault('magnitude_std', float(val)) - elif key == 'm': - magnitude = int(val) - elif key == 'w': - width = int(val) - elif key == 'd': - depth = int(val) - elif key == 'a': - alpha = float(val) - elif key == 'b': - blended = bool(val) - else: - assert False, 'Unknown AugMix config section' - ops = augmix_ops(magnitude=magnitude, hparams=hparams) - return AugMixAugment(ops, alpha=alpha, width=width, depth=depth, blended=blended) diff --git a/AVLFormer/src/timm/data/config.py b/AVLFormer/src/timm/data/config.py deleted file mode 100644 index 9cb4bda..0000000 --- a/AVLFormer/src/timm/data/config.py +++ /dev/null @@ -1,75 +0,0 @@ -import logging -from .constants import * - - -_logger = logging.getLogger(__name__) - - -def resolve_data_config(args, default_cfg={}, model=None, verbose=True): - new_config = {} - default_cfg = default_cfg - if not default_cfg and model is not None and hasattr(model, 'default_cfg'): - default_cfg = model.default_cfg - - # Resolve input/image size - in_chans = 3 - if 'chans' in args and args['chans'] is not None: - in_chans = args['chans'] - - input_size = (in_chans, 224, 224) - if 'input_size' in args and args['input_size'] is not None: - assert isinstance(args['input_size'], (tuple, list)) - assert len(args['input_size']) == 3 - input_size = tuple(args['input_size']) - in_chans = input_size[0] # input_size overrides in_chans - elif 'img_size' in args and args['img_size'] is not None: - assert isinstance(args['img_size'], int) - input_size = (in_chans, args['img_size'], args['img_size']) - elif 'input_size' in default_cfg: - input_size = default_cfg['input_size'] - new_config['input_size'] = input_size - - # resolve interpolation method - new_config['interpolation'] = 'bicubic' - if 'interpolation' in args and args['interpolation']: - new_config['interpolation'] = args['interpolation'] - elif 'interpolation' in default_cfg: - new_config['interpolation'] = default_cfg['interpolation'] - - # resolve dataset + model mean for normalization - new_config['mean'] = IMAGENET_DEFAULT_MEAN - if 'mean' in args and args['mean'] is not None: - mean = tuple(args['mean']) - if len(mean) == 1: - mean = tuple(list(mean) * in_chans) - else: - assert len(mean) == in_chans - new_config['mean'] = mean - elif 'mean' in default_cfg: - new_config['mean'] = default_cfg['mean'] - - # resolve dataset + model std deviation for normalization - new_config['std'] = IMAGENET_DEFAULT_STD - if 'std' in args and args['std'] is not None: - std = tuple(args['std']) - if len(std) == 1: - std = tuple(list(std) * in_chans) - else: - assert len(std) == in_chans - new_config['std'] = std - elif 'std' in default_cfg: - new_config['std'] = default_cfg['std'] - - # resolve default crop percentage - new_config['crop_pct'] = DEFAULT_CROP_PCT - if 'crop_pct' in args and args['crop_pct'] is not None: - new_config['crop_pct'] = args['crop_pct'] - elif 'crop_pct' in default_cfg: - new_config['crop_pct'] = default_cfg['crop_pct'] - - if verbose: - _logger.info('Data processing configuration for current model + dataset:') - for n, v in new_config.items(): - _logger.info('\t%s: %s' % (n, str(v))) - - return new_config diff --git a/AVLFormer/src/timm/data/constants.py b/AVLFormer/src/timm/data/constants.py deleted file mode 100644 index 
d6d4a01..0000000 --- a/AVLFormer/src/timm/data/constants.py +++ /dev/null @@ -1,7 +0,0 @@ -DEFAULT_CROP_PCT = 0.875 -IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) -IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) -IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5) -IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5) -IMAGENET_DPN_MEAN = (124 / 255, 117 / 255, 104 / 255) -IMAGENET_DPN_STD = tuple([1 / (.0167 * 255)] * 3) diff --git a/AVLFormer/src/timm/data/dataset.py b/AVLFormer/src/timm/data/dataset.py deleted file mode 100644 index a7c5ebe..0000000 --- a/AVLFormer/src/timm/data/dataset.py +++ /dev/null @@ -1,145 +0,0 @@ -""" Quick n Simple Image Folder, Tarfile based DataSet - -Hacked together by / Copyright 2020 Ross Wightman -""" -import torch.utils.data as data -import os -import torch -import logging - -from PIL import Image - -from .parsers import create_parser - -_logger = logging.getLogger(__name__) - - -_ERROR_RETRY = 50 - - -class ImageDataset(data.Dataset): - - def __init__( - self, - root, - parser=None, - class_map='', - load_bytes=False, - transform=None, - ): - if parser is None or isinstance(parser, str): - parser = create_parser(parser or '', root=root, class_map=class_map) - self.parser = parser - self.load_bytes = load_bytes - self.transform = transform - self._consecutive_errors = 0 - - def __getitem__(self, index): - img, target = self.parser[index] - try: - img = img.read() if self.load_bytes else Image.open(img).convert('RGB') - except Exception as e: - _logger.warning(f'Skipped sample (index {index}, file {self.parser.filename(index)}). {str(e)}') - self._consecutive_errors += 1 - if self._consecutive_errors < _ERROR_RETRY: - return self.__getitem__((index + 1) % len(self.parser)) - else: - raise e - self._consecutive_errors = 0 - if self.transform is not None: - img = self.transform(img) - if target is None: - target = torch.tensor(-1, dtype=torch.long) - return img, target - - def __len__(self): - return len(self.parser) - - def filename(self, index, basename=False, absolute=False): - return self.parser.filename(index, basename, absolute) - - def filenames(self, basename=False, absolute=False): - return self.parser.filenames(basename, absolute) - - -class IterableImageDataset(data.IterableDataset): - - def __init__( - self, - root, - parser=None, - split='train', - is_training=False, - batch_size=None, - class_map='', - load_bytes=False, - transform=None, - ): - assert parser is not None - if isinstance(parser, str): - self.parser = create_parser( - parser, root=root, split=split, is_training=is_training, batch_size=batch_size) - else: - self.parser = parser - self.transform = transform - self._consecutive_errors = 0 - - def __iter__(self): - for img, target in self.parser: - if self.transform is not None: - img = self.transform(img) - if target is None: - target = torch.tensor(-1, dtype=torch.long) - yield img, target - - def __len__(self): - if hasattr(self.parser, '__len__'): - return len(self.parser) - else: - return 0 - - def filename(self, index, basename=False, absolute=False): - assert False, 'Filename lookup by index not supported, use filenames().' 
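
As a usage sketch for the map-style ImageDataset above (hypothetical /data/train folder layout, and assuming torchvision is available for the transform; the import path follows this tree's src.timm packaging):

import torchvision.transforms as T

from src.timm.data.dataset import ImageDataset

# Hypothetical layout: /data/train/<class_name>/<image>.jpg
ds = ImageDataset(
    '/data/train',
    transform=T.Compose([T.Resize(256), T.CenterCrop(224), T.ToTensor()]))
img, target = ds[0]                       # tensor image, integer class id
print(ds.filename(0, basename=True), int(target))
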
- - def filenames(self, basename=False, absolute=False): - return self.parser.filenames(basename, absolute) - - -class AugMixDataset(torch.utils.data.Dataset): - """Dataset wrapper to perform AugMix or other clean/augmentation mixes""" - - def __init__(self, dataset, num_splits=2): - self.augmentation = None - self.normalize = None - self.dataset = dataset - if self.dataset.transform is not None: - self._set_transforms(self.dataset.transform) - self.num_splits = num_splits - - def _set_transforms(self, x): - assert isinstance(x, (list, tuple)) and len(x) == 3, 'Expecting a tuple/list of 3 transforms' - self.dataset.transform = x[0] - self.augmentation = x[1] - self.normalize = x[2] - - @property - def transform(self): - return self.dataset.transform - - @transform.setter - def transform(self, x): - self._set_transforms(x) - - def _normalize(self, x): - return x if self.normalize is None else self.normalize(x) - - def __getitem__(self, i): - x, y = self.dataset[i] # all splits share the same dataset base transform - x_list = [self._normalize(x)] # first split only normalizes (this is the 'clean' split) - # run the full augmentation on the remaining splits - for _ in range(self.num_splits - 1): - x_list.append(self._normalize(self.augmentation(x))) - return tuple(x_list), y - - def __len__(self): - return len(self.dataset) diff --git a/AVLFormer/src/timm/data/dataset_factory.py b/AVLFormer/src/timm/data/dataset_factory.py deleted file mode 100644 index b2c9688..0000000 --- a/AVLFormer/src/timm/data/dataset_factory.py +++ /dev/null @@ -1,29 +0,0 @@ -import os - -from .dataset import IterableImageDataset, ImageDataset - - -def _search_split(root, split): - # look for sub-folder with name of split in root and use that if it exists - split_name = split.split('[')[0] - try_root = os.path.join(root, split_name) - if os.path.exists(try_root): - return try_root - if split_name == 'validation': - try_root = os.path.join(root, 'val') - if os.path.exists(try_root): - return try_root - return root - - -def create_dataset(name, root, split='validation', search_split=True, is_training=False, batch_size=None, **kwargs): - name = name.lower() - if name.startswith('tfds'): - ds = IterableImageDataset( - root, parser=name, split=split, is_training=is_training, batch_size=batch_size, **kwargs) - else: - # FIXME support more advance split cfg for ImageFolder/Tar datasets in the future - if search_split and os.path.isdir(root): - root = _search_split(root, split) - ds = ImageDataset(root, parser=name, **kwargs) - return ds diff --git a/AVLFormer/src/timm/data/distributed_sampler.py b/AVLFormer/src/timm/data/distributed_sampler.py deleted file mode 100644 index 9506a88..0000000 --- a/AVLFormer/src/timm/data/distributed_sampler.py +++ /dev/null @@ -1,51 +0,0 @@ -import math -import torch -from torch.utils.data import Sampler -import torch.distributed as dist - - -class OrderedDistributedSampler(Sampler): - """Sampler that restricts data loading to a subset of the dataset. - It is especially useful in conjunction with - :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each - process can pass a DistributedSampler instance as a DataLoader sampler, - and load a subset of the original dataset that is exclusive to it. - .. note:: - Dataset is assumed to be of constant size. - Arguments: - dataset: Dataset used for sampling. - num_replicas (optional): Number of processes participating in - distributed training. - rank (optional): Rank of the current process within num_replicas. 
- """ - - def __init__(self, dataset, num_replicas=None, rank=None): - if num_replicas is None: - if not dist.is_available(): - raise RuntimeError("Requires distributed package to be available") - num_replicas = dist.get_world_size() - if rank is None: - if not dist.is_available(): - raise RuntimeError("Requires distributed package to be available") - rank = dist.get_rank() - self.dataset = dataset - self.num_replicas = num_replicas - self.rank = rank - self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) - self.total_size = self.num_samples * self.num_replicas - - def __iter__(self): - indices = list(range(len(self.dataset))) - - # add extra samples to make it evenly divisible - indices += indices[:(self.total_size - len(indices))] - assert len(indices) == self.total_size - - # subsample - indices = indices[self.rank:self.total_size:self.num_replicas] - assert len(indices) == self.num_samples - - return iter(indices) - - def __len__(self): - return self.num_samples diff --git a/AVLFormer/src/timm/data/loader.py b/AVLFormer/src/timm/data/loader.py deleted file mode 100644 index 7614466..0000000 --- a/AVLFormer/src/timm/data/loader.py +++ /dev/null @@ -1,262 +0,0 @@ -""" Loader Factory, Fast Collate, CUDA Prefetcher - -Prefetcher and Fast Collate inspired by NVIDIA APEX example at -https://github.com/NVIDIA/apex/commit/d5e2bb4bdeedd27b1dfaf5bb2b24d6c000dee9be#diff-cf86c282ff7fba81fad27a559379d5bf - -Hacked together by / Copyright 2020 Ross Wightman -""" - -import torch.utils.data -import numpy as np - -from .transforms_factory import create_transform -from .constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .distributed_sampler import OrderedDistributedSampler -from .random_erasing import RandomErasing -from .mixup import FastCollateMixup - - -def fast_collate(batch): - """ A fast collation function optimized for uint8 images (np array or torch) and int64 targets (labels)""" - assert isinstance(batch[0], tuple) - batch_size = len(batch) - if isinstance(batch[0][0], tuple): - # This branch 'deinterleaves' and flattens tuples of input tensors into one tensor ordered by position - # such that all tuple of position n will end up in a torch.split(tensor, batch_size) in nth position - inner_tuple_size = len(batch[0][0]) - flattened_batch_size = batch_size * inner_tuple_size - targets = torch.zeros(flattened_batch_size, dtype=torch.int64) - tensor = torch.zeros((flattened_batch_size, *batch[0][0][0].shape), dtype=torch.uint8) - for i in range(batch_size): - assert len(batch[i][0]) == inner_tuple_size # all input tensor tuples must be same length - for j in range(inner_tuple_size): - targets[i + j * batch_size] = batch[i][1] - tensor[i + j * batch_size] += torch.from_numpy(batch[i][0][j]) - return tensor, targets - elif isinstance(batch[0][0], np.ndarray): - targets = torch.tensor([b[1] for b in batch], dtype=torch.int64) - assert len(targets) == batch_size - tensor = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8) - for i in range(batch_size): - tensor[i] += torch.from_numpy(batch[i][0]) - return tensor, targets - elif isinstance(batch[0][0], torch.Tensor): - targets = torch.tensor([b[1] for b in batch], dtype=torch.int64) - assert len(targets) == batch_size - tensor = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8) - for i in range(batch_size): - tensor[i].copy_(batch[i][0]) - return tensor, targets - else: - assert False - - -class PrefetchLoader: - - def __init__(self, - loader, - mean=IMAGENET_DEFAULT_MEAN, - 
std=IMAGENET_DEFAULT_STD, - fp16=False, - re_prob=0., - re_mode='const', - re_count=1, - re_num_splits=0): - self.loader = loader - self.mean = torch.tensor([x * 255 for x in mean]).cuda().view(1, 3, 1, 1) - self.std = torch.tensor([x * 255 for x in std]).cuda().view(1, 3, 1, 1) - self.fp16 = fp16 - if fp16: - self.mean = self.mean.half() - self.std = self.std.half() - if re_prob > 0.: - self.random_erasing = RandomErasing( - probability=re_prob, mode=re_mode, max_count=re_count, num_splits=re_num_splits) - else: - self.random_erasing = None - - def __iter__(self): - stream = torch.cuda.Stream() - first = True - - for next_input, next_target in self.loader: - with torch.cuda.stream(stream): - next_input = next_input.cuda(non_blocking=True) - next_target = next_target.cuda(non_blocking=True) - if self.fp16: - next_input = next_input.half().sub_(self.mean).div_(self.std) - else: - next_input = next_input.float().sub_(self.mean).div_(self.std) - if self.random_erasing is not None: - next_input = self.random_erasing(next_input) - - if not first: - yield input, target - else: - first = False - - torch.cuda.current_stream().wait_stream(stream) - input = next_input - target = next_target - - yield input, target - - def __len__(self): - return len(self.loader) - - @property - def sampler(self): - return self.loader.sampler - - @property - def dataset(self): - return self.loader.dataset - - @property - def mixup_enabled(self): - if isinstance(self.loader.collate_fn, FastCollateMixup): - return self.loader.collate_fn.mixup_enabled - else: - return False - - @mixup_enabled.setter - def mixup_enabled(self, x): - if isinstance(self.loader.collate_fn, FastCollateMixup): - self.loader.collate_fn.mixup_enabled = x - - -def create_loader( - dataset, - input_size, - batch_size, - is_training=False, - use_prefetcher=True, - no_aug=False, - re_prob=0., - re_mode='const', - re_count=1, - re_split=False, - scale=None, - ratio=None, - hflip=0.5, - vflip=0., - color_jitter=0.4, - auto_augment=None, - num_aug_splits=0, - interpolation='bilinear', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - num_workers=1, - distributed=False, - crop_pct=None, - collate_fn=None, - pin_memory=False, - fp16=False, - tf_preprocessing=False, - use_multi_epochs_loader=False, - persistent_workers=True, -): - re_num_splits = 0 - if re_split: - # apply RE to second half of batch if no aug split otherwise line up with aug split - re_num_splits = num_aug_splits or 2 - dataset.transform = create_transform( - input_size, - is_training=is_training, - use_prefetcher=use_prefetcher, - no_aug=no_aug, - scale=scale, - ratio=ratio, - hflip=hflip, - vflip=vflip, - color_jitter=color_jitter, - auto_augment=auto_augment, - interpolation=interpolation, - mean=mean, - std=std, - crop_pct=crop_pct, - tf_preprocessing=tf_preprocessing, - re_prob=re_prob, - re_mode=re_mode, - re_count=re_count, - re_num_splits=re_num_splits, - separate=num_aug_splits > 0, - ) - - sampler = None - if distributed and not isinstance(dataset, torch.utils.data.IterableDataset): - if is_training: - sampler = torch.utils.data.distributed.DistributedSampler(dataset) - else: - # This will add extra duplicate entries to result in equal num - # of samples per-process, will slightly alter validation results - sampler = OrderedDistributedSampler(dataset) - - if collate_fn is None: - collate_fn = fast_collate if use_prefetcher else torch.utils.data.dataloader.default_collate - - loader_class = torch.utils.data.DataLoader - - if use_multi_epochs_loader: - loader_class = 
MultiEpochsDataLoader - - loader_args = dict( - batch_size=batch_size, - shuffle=not isinstance(dataset, torch.utils.data.IterableDataset) and sampler is None and is_training, - num_workers=num_workers, - sampler=sampler, - collate_fn=collate_fn, - pin_memory=pin_memory, - drop_last=is_training, - persistent_workers=persistent_workers) - try: - loader = loader_class(dataset, **loader_args) - except TypeError as e: - loader_args.pop('persistent_workers') # only in Pytorch 1.7+ - loader = loader_class(dataset, **loader_args) - if use_prefetcher: - prefetch_re_prob = re_prob if is_training and not no_aug else 0. - loader = PrefetchLoader( - loader, - mean=mean, - std=std, - fp16=fp16, - re_prob=prefetch_re_prob, - re_mode=re_mode, - re_count=re_count, - re_num_splits=re_num_splits - ) - - return loader - - -class MultiEpochsDataLoader(torch.utils.data.DataLoader): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._DataLoader__initialized = False - self.batch_sampler = _RepeatSampler(self.batch_sampler) - self._DataLoader__initialized = True - self.iterator = super().__iter__() - - def __len__(self): - return len(self.batch_sampler.sampler) - - def __iter__(self): - for i in range(len(self)): - yield next(self.iterator) - - -class _RepeatSampler(object): - """ Sampler that repeats forever. - - Args: - sampler (Sampler) - """ - - def __init__(self, sampler): - self.sampler = sampler - - def __iter__(self): - while True: - yield from iter(self.sampler) diff --git a/AVLFormer/src/timm/data/mixup.py b/AVLFormer/src/timm/data/mixup.py deleted file mode 100644 index 3847754..0000000 --- a/AVLFormer/src/timm/data/mixup.py +++ /dev/null @@ -1,316 +0,0 @@ -""" Mixup and Cutmix - -Papers: -mixup: Beyond Empirical Risk Minimization (https://arxiv.org/abs/1710.09412) - -CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features (https://arxiv.org/abs/1905.04899) - -Code Reference: -CutMix: https://github.com/clovaai/CutMix-PyTorch - -Hacked together by / Copyright 2020 Ross Wightman -""" -import numpy as np -import torch - - -def one_hot(x, num_classes, on_value=1., off_value=0., device='cuda'): - x = x.long().view(-1, 1) - return torch.full((x.size()[0], num_classes), off_value, device=device).scatter_(1, x, on_value) - - -def mixup_target(target, num_classes, lam=1., smoothing=0.0, device='cuda'): - off_value = smoothing / num_classes - on_value = 1. - smoothing + off_value - y1 = one_hot(target, num_classes, on_value=on_value, off_value=off_value, device=device) - y2 = one_hot(target.flip(0), num_classes, on_value=on_value, off_value=off_value, device=device) - return y1 * lam + y2 * (1. - lam) - - -def rand_bbox(img_shape, lam, margin=0., count=None): - """ Standard CutMix bounding-box - Generates a random square bbox based on lambda value. This impl includes - support for enforcing a border margin as percent of bbox dimensions. 
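
To make the lambda-to-box mapping concrete, a worked example (hypothetical values, not part of the original docstring): for lam = 0.75 on a 224x224 input the cut edge ratio is sqrt(1 - lam) = 0.5, so before any clipping at the image border (which cutmix_bbox_and_lam corrects for) the box removes exactly 1 - lam of the area:

import numpy as np

img_h = img_w = 224
lam = 0.75
ratio = np.sqrt(1 - lam)                               # 0.5
cut_h, cut_w = int(img_h * ratio), int(img_w * ratio)  # 112 x 112
removed = (cut_h * cut_w) / (img_h * img_w)            # 0.25 == 1 - lam
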
- - Args: - img_shape (tuple): Image shape as tuple - lam (float): Cutmix lambda value - margin (float): Percentage of bbox dimension to enforce as margin (reduce amount of box outside image) - count (int): Number of bbox to generate - """ - ratio = np.sqrt(1 - lam) - img_h, img_w = img_shape[-2:] - cut_h, cut_w = int(img_h * ratio), int(img_w * ratio) - margin_y, margin_x = int(margin * cut_h), int(margin * cut_w) - cy = np.random.randint(0 + margin_y, img_h - margin_y, size=count) - cx = np.random.randint(0 + margin_x, img_w - margin_x, size=count) - yl = np.clip(cy - cut_h // 2, 0, img_h) - yh = np.clip(cy + cut_h // 2, 0, img_h) - xl = np.clip(cx - cut_w // 2, 0, img_w) - xh = np.clip(cx + cut_w // 2, 0, img_w) - return yl, yh, xl, xh - - -def rand_bbox_minmax(img_shape, minmax, count=None): - """ Min-Max CutMix bounding-box - Inspired by Darknet cutmix impl, generates a random rectangular bbox - based on min/max percent values applied to each dimension of the input image. - - Typical defaults for minmax are usually in the .2-.3 for min and .8-.9 range for max. - - Args: - img_shape (tuple): Image shape as tuple - minmax (tuple or list): Min and max bbox ratios (as percent of image size) - count (int): Number of bbox to generate - """ - assert len(minmax) == 2 - img_h, img_w = img_shape[-2:] - cut_h = np.random.randint(int(img_h * minmax[0]), int(img_h * minmax[1]), size=count) - cut_w = np.random.randint(int(img_w * minmax[0]), int(img_w * minmax[1]), size=count) - yl = np.random.randint(0, img_h - cut_h, size=count) - xl = np.random.randint(0, img_w - cut_w, size=count) - yu = yl + cut_h - xu = xl + cut_w - return yl, yu, xl, xu - - -def cutmix_bbox_and_lam(img_shape, lam, ratio_minmax=None, correct_lam=True, count=None): - """ Generate bbox and apply lambda correction. - """ - if ratio_minmax is not None: - yl, yu, xl, xu = rand_bbox_minmax(img_shape, ratio_minmax, count=count) - else: - yl, yu, xl, xu = rand_bbox(img_shape, lam, count=count) - if correct_lam or ratio_minmax is not None: - bbox_area = (yu - yl) * (xu - xl) - lam = 1. - bbox_area / float(img_shape[-2] * img_shape[-1]) - return (yl, yu, xl, xu), lam - - -class Mixup: - """ Mixup/Cutmix that applies different params to each element or whole batch - - Args: - mixup_alpha (float): mixup alpha value, mixup is active if > 0. - cutmix_alpha (float): cutmix alpha value, cutmix is active if > 0. - cutmix_minmax (List[float]): cutmix min/max image ratio, cutmix is active and uses this vs alpha if not None. 
- prob (float): probability of applying mixup or cutmix per batch or element - switch_prob (float): probability of switching to cutmix instead of mixup when both are active - mode (str): how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element)) - correct_lam (bool): apply lambda correction when cutmix bbox clipped by image borders - label_smoothing (float): apply label smoothing to the mixed target tensor - num_classes (int): number of classes for target - """ - def __init__(self, mixup_alpha=1., cutmix_alpha=0., cutmix_minmax=None, prob=1.0, switch_prob=0.5, - mode='batch', correct_lam=True, label_smoothing=0.1, num_classes=1000): - self.mixup_alpha = mixup_alpha - self.cutmix_alpha = cutmix_alpha - self.cutmix_minmax = cutmix_minmax - if self.cutmix_minmax is not None: - assert len(self.cutmix_minmax) == 2 - # force cutmix alpha == 1.0 when minmax active to keep logic simple & safe - self.cutmix_alpha = 1.0 - self.mix_prob = prob - self.switch_prob = switch_prob - self.label_smoothing = label_smoothing - self.num_classes = num_classes - self.mode = mode - self.correct_lam = correct_lam # correct lambda based on clipped area for cutmix - self.mixup_enabled = True # set to false to disable mixing (intended to be set by train loop) - - def _params_per_elem(self, batch_size): - lam = np.ones(batch_size, dtype=np.float32) - use_cutmix = np.zeros(batch_size, dtype=bool) - if self.mixup_enabled: - if self.mixup_alpha > 0. and self.cutmix_alpha > 0.: - use_cutmix = np.random.rand(batch_size) < self.switch_prob - lam_mix = np.where( - use_cutmix, - np.random.beta(self.cutmix_alpha, self.cutmix_alpha, size=batch_size), - np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size)) - elif self.mixup_alpha > 0.: - lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size) - elif self.cutmix_alpha > 0.: - use_cutmix = np.ones(batch_size, dtype=bool) - lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha, size=batch_size) - else: - assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true." - lam = np.where(np.random.rand(batch_size) < self.mix_prob, lam_mix.astype(np.float32), lam) - return lam, use_cutmix - - def _params_per_batch(self): - lam = 1. - use_cutmix = False - if self.mixup_enabled and np.random.rand() < self.mix_prob: - if self.mixup_alpha > 0. and self.cutmix_alpha > 0.: - use_cutmix = np.random.rand() < self.switch_prob - lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) if use_cutmix else \ - np.random.beta(self.mixup_alpha, self.mixup_alpha) - elif self.mixup_alpha > 0.: - lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha) - elif self.cutmix_alpha > 0.: - use_cutmix = True - lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) - else: - assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true."
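
A usage sketch for the Mixup class above (assumes a CUDA device, since mixup_target builds its one-hot targets on 'cuda' by default, and an even batch size as required by __call__; the import path follows this tree's packaging):

import torch

from src.timm.data.mixup import Mixup

mixup_fn = Mixup(mixup_alpha=0.8, cutmix_alpha=1.0, prob=1.0,
                 switch_prob=0.5, label_smoothing=0.1, num_classes=1000)
x = torch.rand(8, 3, 224, 224).cuda()   # even batch size required
y = torch.randint(0, 1000, (8,)).cuda()
x, y = mixup_fn(x, y)                   # y: (8, 1000) smoothed, mixed soft targets
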
- lam = float(lam_mix) - return lam, use_cutmix - - def _mix_elem(self, x): - batch_size = len(x) - lam_batch, use_cutmix = self._params_per_elem(batch_size) - x_orig = x.clone() # need to keep an unmodified original for mixing source - for i in range(batch_size): - j = batch_size - i - 1 - lam = lam_batch[i] - if lam != 1.: - if use_cutmix[i]: - (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( - x[i].shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) - x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh] - lam_batch[i] = lam - else: - x[i] = x[i] * lam + x_orig[j] * (1 - lam) - return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1) - - def _mix_pair(self, x): - batch_size = len(x) - lam_batch, use_cutmix = self._params_per_elem(batch_size // 2) - x_orig = x.clone() # need to keep an unmodified original for mixing source - for i in range(batch_size // 2): - j = batch_size - i - 1 - lam = lam_batch[i] - if lam != 1.: - if use_cutmix[i]: - (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( - x[i].shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) - x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh] - x[j][:, yl:yh, xl:xh] = x_orig[i][:, yl:yh, xl:xh] - lam_batch[i] = lam - else: - x[i] = x[i] * lam + x_orig[j] * (1 - lam) - x[j] = x[j] * lam + x_orig[i] * (1 - lam) - lam_batch = np.concatenate((lam_batch, lam_batch[::-1])) - return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1) - - def _mix_batch(self, x): - lam, use_cutmix = self._params_per_batch() - if lam == 1.: - return 1. - if use_cutmix: - (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( - x.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) - x[:, :, yl:yh, xl:xh] = x.flip(0)[:, :, yl:yh, xl:xh] - else: - x_flipped = x.flip(0).mul_(1. - lam) - x.mul_(lam).add_(x_flipped) - return lam - - def __call__(self, x, target): - assert len(x) % 2 == 0, 'Batch size should be even when using this' - if self.mode == 'elem': - lam = self._mix_elem(x) - elif self.mode == 'pair': - lam = self._mix_pair(x) - else: - lam = self._mix_batch(x) - target = mixup_target(target, self.num_classes, lam, self.label_smoothing) - return x, target - - -class FastCollateMixup(Mixup): - """ Fast Collate w/ Mixup/Cutmix that applies different params to each element or whole batch - - A Mixup impl that's performed while collating the batches. 
- """ - - def _mix_elem_collate(self, output, batch, half=False): - batch_size = len(batch) - num_elem = batch_size // 2 if half else batch_size - assert len(output) == num_elem - lam_batch, use_cutmix = self._params_per_elem(num_elem) - for i in range(num_elem): - j = batch_size - i - 1 - lam = lam_batch[i] - mixed = batch[i][0] - if lam != 1.: - if use_cutmix[i]: - if not half: - mixed = mixed.copy() - (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( - output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) - mixed[:, yl:yh, xl:xh] = batch[j][0][:, yl:yh, xl:xh] - lam_batch[i] = lam - else: - mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam) - np.rint(mixed, out=mixed) - output[i] += torch.from_numpy(mixed.astype(np.uint8)) - if half: - lam_batch = np.concatenate((lam_batch, np.ones(num_elem))) - return torch.tensor(lam_batch).unsqueeze(1) - - def _mix_pair_collate(self, output, batch): - batch_size = len(batch) - lam_batch, use_cutmix = self._params_per_elem(batch_size // 2) - for i in range(batch_size // 2): - j = batch_size - i - 1 - lam = lam_batch[i] - mixed_i = batch[i][0] - mixed_j = batch[j][0] - assert 0 <= lam <= 1.0 - if lam < 1.: - if use_cutmix[i]: - (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( - output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) - patch_i = mixed_i[:, yl:yh, xl:xh].copy() - mixed_i[:, yl:yh, xl:xh] = mixed_j[:, yl:yh, xl:xh] - mixed_j[:, yl:yh, xl:xh] = patch_i - lam_batch[i] = lam - else: - mixed_temp = mixed_i.astype(np.float32) * lam + mixed_j.astype(np.float32) * (1 - lam) - mixed_j = mixed_j.astype(np.float32) * lam + mixed_i.astype(np.float32) * (1 - lam) - mixed_i = mixed_temp - np.rint(mixed_j, out=mixed_j) - np.rint(mixed_i, out=mixed_i) - output[i] += torch.from_numpy(mixed_i.astype(np.uint8)) - output[j] += torch.from_numpy(mixed_j.astype(np.uint8)) - lam_batch = np.concatenate((lam_batch, lam_batch[::-1])) - return torch.tensor(lam_batch).unsqueeze(1) - - def _mix_batch_collate(self, output, batch): - batch_size = len(batch) - lam, use_cutmix = self._params_per_batch() - if use_cutmix: - (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( - output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) - for i in range(batch_size): - j = batch_size - i - 1 - mixed = batch[i][0] - if lam != 1.: - if use_cutmix: - mixed = mixed.copy() # don't want to modify the original while iterating - mixed[:, yl:yh, xl:xh] = batch[j][0][:, yl:yh, xl:xh] - else: - mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam) - np.rint(mixed, out=mixed) - output[i] += torch.from_numpy(mixed.astype(np.uint8)) - return lam - - def __call__(self, batch, _=None): - batch_size = len(batch) - assert batch_size % 2 == 0, 'Batch size should be even when using this' - half = 'half' in self.mode - if half: - batch_size //= 2 - output = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8) - if self.mode == 'elem' or self.mode == 'half': - lam = self._mix_elem_collate(output, batch, half=half) - elif self.mode == 'pair': - lam = self._mix_pair_collate(output, batch) - else: - lam = self._mix_batch_collate(output, batch) - target = torch.tensor([b[1] for b in batch], dtype=torch.int64) - target = mixup_target(target, self.num_classes, lam, self.label_smoothing, device='cpu') - target = target[:batch_size] - return output, target - diff --git a/AVLFormer/src/timm/data/parsers/__init__.py b/AVLFormer/src/timm/data/parsers/__init__.py deleted file 
mode 100644 index eeb44e3..0000000 --- a/AVLFormer/src/timm/data/parsers/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .parser_factory import create_parser diff --git a/AVLFormer/src/timm/data/parsers/class_map.py b/AVLFormer/src/timm/data/parsers/class_map.py deleted file mode 100644 index 9ef4d1f..0000000 --- a/AVLFormer/src/timm/data/parsers/class_map.py +++ /dev/null @@ -1,16 +0,0 @@ -import os - - -def load_class_map(filename, root=''): - class_map_path = filename - if not os.path.exists(class_map_path): - class_map_path = os.path.join(root, filename) - assert os.path.exists(class_map_path), 'Cannot locate specified class map file (%s)' % filename - class_map_ext = os.path.splitext(filename)[-1].lower() - if class_map_ext == '.txt': - with open(class_map_path) as f: - class_to_idx = {v.strip(): k for k, v in enumerate(f)} - else: - assert False, 'Unsupported class map extension' - return class_to_idx - diff --git a/AVLFormer/src/timm/data/parsers/constants.py b/AVLFormer/src/timm/data/parsers/constants.py deleted file mode 100644 index e7ba484..0000000 --- a/AVLFormer/src/timm/data/parsers/constants.py +++ /dev/null @@ -1 +0,0 @@ -IMG_EXTENSIONS = ('.png', '.jpg', '.jpeg') diff --git a/AVLFormer/src/timm/data/parsers/parser.py b/AVLFormer/src/timm/data/parsers/parser.py deleted file mode 100644 index 76ab6d1..0000000 --- a/AVLFormer/src/timm/data/parsers/parser.py +++ /dev/null @@ -1,17 +0,0 @@ -from abc import abstractmethod - - -class Parser: - def __init__(self): - pass - - @abstractmethod - def _filename(self, index, basename=False, absolute=False): - pass - - def filename(self, index, basename=False, absolute=False): - return self._filename(index, basename=basename, absolute=absolute) - - def filenames(self, basename=False, absolute=False): - return [self._filename(index, basename=basename, absolute=absolute) for index in range(len(self))] - diff --git a/AVLFormer/src/timm/data/parsers/parser_factory.py b/AVLFormer/src/timm/data/parsers/parser_factory.py deleted file mode 100644 index 419ffe8..0000000 --- a/AVLFormer/src/timm/data/parsers/parser_factory.py +++ /dev/null @@ -1,29 +0,0 @@ -import os - -from .parser_image_folder import ParserImageFolder -from .parser_image_tar import ParserImageTar -from .parser_image_in_tar import ParserImageInTar - - -def create_parser(name, root, split='train', **kwargs): - name = name.lower() - name = name.split('/', 2) - prefix = '' - if len(name) > 1: - prefix = name[0] - name = name[-1] - - # FIXME improve the selection; right now just tfds prefix or fallback path, will need options to - # explicitly select other options shortly - if prefix == 'tfds': - from .parser_tfds import ParserTfds # defer tensorflow import - parser = ParserTfds(root, name, split=split, shuffle=kwargs.pop('shuffle', False), **kwargs) - else: - assert os.path.exists(root) - # default fallback path (backwards compat), use image tar if root is a .tar file, otherwise image folder - # FIXME support split here, in parser? - if os.path.isfile(root) and os.path.splitext(root)[1] == '.tar': - parser = ParserImageInTar(root, **kwargs) - else: - parser = ParserImageFolder(root, **kwargs) - return parser diff --git a/AVLFormer/src/timm/data/parsers/parser_image_folder.py b/AVLFormer/src/timm/data/parsers/parser_image_folder.py deleted file mode 100644 index d08cde8..0000000 --- a/AVLFormer/src/timm/data/parsers/parser_image_folder.py +++ /dev/null @@ -1,69 +0,0 @@ -""" A dataset parser that reads images from folders - -Folders are scanned recursively to find image files.
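
A small sketch of the class-map convention implemented by load_class_map above (hypothetical file contents and root path; indices follow line order in the .txt file):

from src.timm.data.parsers.class_map import load_class_map

# class_map.txt (hypothetical), one class name per line:
#   cat
#   dog
class_to_idx = load_class_map('class_map.txt', root='/data')
# falls back to /data/class_map.txt if the bare filename does not exist
# -> {'cat': 0, 'dog': 1}
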
Labels are based -on the folder hierarchy, just leaf folders by default. - -Hacked together by / Copyright 2020 Ross Wightman -""" -import os - -from src.timm.utils.misc import natural_key - -from .parser import Parser -from .class_map import load_class_map -from .constants import IMG_EXTENSIONS - - -def find_images_and_targets(folder, types=IMG_EXTENSIONS, class_to_idx=None, leaf_name_only=True, sort=True): - labels = [] - filenames = [] - for root, subdirs, files in os.walk(folder, topdown=False, followlinks=True): - rel_path = os.path.relpath(root, folder) if (root != folder) else '' - label = os.path.basename(rel_path) if leaf_name_only else rel_path.replace(os.path.sep, '_') - for f in files: - base, ext = os.path.splitext(f) - if ext.lower() in types: - filenames.append(os.path.join(root, f)) - labels.append(label) - if class_to_idx is None: - # building class index - unique_labels = set(labels) - sorted_labels = list(sorted(unique_labels, key=natural_key)) - class_to_idx = {c: idx for idx, c in enumerate(sorted_labels)} - images_and_targets = [(f, class_to_idx[l]) for f, l in zip(filenames, labels) if l in class_to_idx] - if sort: - images_and_targets = sorted(images_and_targets, key=lambda k: natural_key(k[0])) - return images_and_targets, class_to_idx - - -class ParserImageFolder(Parser): - - def __init__( - self, - root, - class_map=''): - super().__init__() - - self.root = root - class_to_idx = None - if class_map: - class_to_idx = load_class_map(class_map, root) - self.samples, self.class_to_idx = find_images_and_targets(root, class_to_idx=class_to_idx) - if len(self.samples) == 0: - raise RuntimeError( - f'Found 0 images in subfolders of {root}. Supported image extensions are {", ".join(IMG_EXTENSIONS)}') - - def __getitem__(self, index): - path, target = self.samples[index] - return open(path, 'rb'), target - - def __len__(self): - return len(self.samples) - - def _filename(self, index, basename=False, absolute=False): - filename = self.samples[index][0] - if basename: - filename = os.path.basename(filename) - elif not absolute: - filename = os.path.relpath(filename, self.root) - return filename diff --git a/AVLFormer/src/timm/data/parsers/parser_image_in_tar.py b/AVLFormer/src/timm/data/parsers/parser_image_in_tar.py deleted file mode 100644 index 419bf10..0000000 --- a/AVLFormer/src/timm/data/parsers/parser_image_in_tar.py +++ /dev/null @@ -1,222 +0,0 @@ -""" A dataset parser that reads tarfile based datasets - -This parser can read and extract image samples from: -* a single tar of image files -* a folder of multiple tarfiles containing imagefiles -* a tar of tars containing image files - -Labels are based on the combined folder and/or tar name structure. 
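
Before the tar-based parsers below, a minimal sketch of the folder scanning just defined (hypothetical directory layout; labels come from the leaf folder names):

from src.timm.data.parsers.parser_image_folder import find_images_and_targets

# Hypothetical layout: ./imgs/cat/0.jpg, ./imgs/dog/1.jpg
samples, class_to_idx = find_images_and_targets('./imgs')
# samples      -> [('./imgs/cat/0.jpg', 0), ('./imgs/dog/1.jpg', 1)]
# class_to_idx -> {'cat': 0, 'dog': 1}
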
- -Hacked together by / Copyright 2020 Ross Wightman -""" -import os -import tarfile -import pickle -import logging -import numpy as np -from glob import glob -from typing import List, Dict - -from src.timm.utils.misc import natural_key - -from .parser import Parser -from .class_map import load_class_map -from .constants import IMG_EXTENSIONS - - -_logger = logging.getLogger(__name__) -CACHE_FILENAME_SUFFIX = '_tarinfos.pickle' - - -class TarState: - - def __init__(self, tf: tarfile.TarFile = None, ti: tarfile.TarInfo = None): - self.tf: tarfile.TarFile = tf - self.ti: tarfile.TarInfo = ti - self.children: Dict[str, TarState] = {} # child states (tars within tars) - - def reset(self): - self.tf = None - - -def _extract_tarinfo(tf: tarfile.TarFile, parent_info: Dict, extensions=IMG_EXTENSIONS): - sample_count = 0 - for i, ti in enumerate(tf): - if not ti.isfile(): - continue - dirname, basename = os.path.split(ti.path) - name, ext = os.path.splitext(basename) - ext = ext.lower() - if ext == '.tar': - with tarfile.open(fileobj=tf.extractfile(ti), mode='r|') as ctf: - child_info = dict( - name=ti.name, path=os.path.join(parent_info['path'], name), ti=ti, children=[], samples=[]) - sample_count += _extract_tarinfo(ctf, child_info, extensions=extensions) - _logger.debug(f'{i}/?. Extracted child tarinfos from {ti.name}. {len(child_info["samples"])} images.') - parent_info['children'].append(child_info) - elif ext in extensions: - parent_info['samples'].append(ti) - sample_count += 1 - return sample_count - - -def extract_tarinfos(root, class_name_to_idx=None, cache_tarinfo=None, extensions=IMG_EXTENSIONS, sort=True): - root_is_tar = False - if os.path.isfile(root): - assert os.path.splitext(root)[-1].lower() == '.tar' - tar_filenames = [root] - root, root_name = os.path.split(root) - root_name = os.path.splitext(root_name)[0] - root_is_tar = True - else: - root_name = root.strip(os.path.sep).split(os.path.sep)[-1] - tar_filenames = glob(os.path.join(root, '*.tar'), recursive=True) - num_tars = len(tar_filenames) - tar_bytes = sum([os.path.getsize(f) for f in tar_filenames]) - assert num_tars, f'No .tar files found at specified path ({root}).' - - _logger.info(f'Scanning {tar_bytes/1024**2:.2f}MB of tar files...') - info = dict(tartrees=[]) - cache_path = '' - if cache_tarinfo is None: - cache_tarinfo = True if tar_bytes > 10*1024**3 else False # FIXME magic number, 10GB - if cache_tarinfo: - cache_filename = '_' + root_name + CACHE_FILENAME_SUFFIX - cache_path = os.path.join(root, cache_filename) - if os.path.exists(cache_path): - _logger.info(f'Reading tar info from cache file {cache_path}.') - with open(cache_path, 'rb') as pf: - info = pickle.load(pf) - assert len(info['tartrees']) == num_tars, "Cached tartree len doesn't match number of tarfiles" - else: - for i, fn in enumerate(tar_filenames): - path = '' if root_is_tar else os.path.splitext(os.path.basename(fn))[0] - with tarfile.open(fn, mode='r|') as tf: # tarinfo scans done in streaming mode - parent_info = dict(name=os.path.relpath(fn, root), path=path, ti=None, children=[], samples=[]) - num_samples = _extract_tarinfo(tf, parent_info, extensions=extensions) - num_children = len(parent_info["children"]) - _logger.debug( - f'{i}/{num_tars}. Extracted tarinfos from {fn}. 
{num_children} children, {num_samples} samples.') - info['tartrees'].append(parent_info) - if cache_path: - _logger.info(f'Writing tar info to cache file {cache_path}.') - with open(cache_path, 'wb') as pf: - pickle.dump(info, pf) - - samples = [] - labels = [] - build_class_map = False - if class_name_to_idx is None: - build_class_map = True - - # Flatten tartree info into lists of samples and targets w/ targets based on label id via - # class map arg or from unique paths. - # NOTE: currently only flattening up to two-levels, filesystem .tars and then one level of sub-tar children - # this covers my current use cases and keeps things a little easier to test for now. - tarfiles = [] - - def _label_from_paths(*path, leaf_only=True): - path = os.path.join(*path).strip(os.path.sep) - return path.split(os.path.sep)[-1] if leaf_only else path.replace(os.path.sep, '_') - - def _add_samples(info, fn): - added = 0 - for s in info['samples']: - label = _label_from_paths(info['path'], os.path.dirname(s.path)) - if not build_class_map and label not in class_name_to_idx: - continue - samples.append((s, fn, info['ti'])) - labels.append(label) - added += 1 - return added - - _logger.info(f'Collecting samples and building tar states.') - for parent_info in info['tartrees']: - # if tartree has children, we assume all samples are at the child level - tar_name = None if root_is_tar else parent_info['name'] - tar_state = TarState() - parent_added = 0 - for child_info in parent_info['children']: - child_added = _add_samples(child_info, fn=tar_name) - if child_added: - tar_state.children[child_info['name']] = TarState(ti=child_info['ti']) - parent_added += child_added - parent_added += _add_samples(parent_info, fn=tar_name) - if parent_added: - tarfiles.append((tar_name, tar_state)) - del info - - if build_class_map: - # build class index - sorted_labels = list(sorted(set(labels), key=natural_key)) - class_name_to_idx = {c: idx for idx, c in enumerate(sorted_labels)} - - _logger.info(f'Mapping targets and sorting samples.') - samples_and_targets = [(s, class_name_to_idx[l]) for s, l in zip(samples, labels) if l in class_name_to_idx] - if sort: - samples_and_targets = sorted(samples_and_targets, key=lambda k: natural_key(k[0][0].path)) - samples, targets = zip(*samples_and_targets) - samples = np.array(samples) - targets = np.array(targets) - _logger.info(f'Finished processing {len(samples)} samples across {len(tarfiles)} tar files.') - return samples, targets, class_name_to_idx, tarfiles - - -class ParserImageInTar(Parser): - """ Multi-tarfile dataset parser where there is one .tar file per class - """ - - def __init__(self, root, class_map='', cache_tarfiles=True, cache_tarinfo=None): - super().__init__() - - class_name_to_idx = None - if class_map: - class_name_to_idx = load_class_map(class_map, root) - self.root = root - self.samples, self.targets, self.class_name_to_idx, tarfiles = extract_tarinfos( - self.root, - class_name_to_idx=class_name_to_idx, - cache_tarinfo=cache_tarinfo, - extensions=IMG_EXTENSIONS) - self.class_idx_to_name = {v: k for k, v in self.class_name_to_idx.items()} - if len(tarfiles) == 1 and tarfiles[0][0] is None: - self.root_is_tar = True - self.tar_state = tarfiles[0][1] - else: - self.root_is_tar = False - self.tar_state = dict(tarfiles) - self.cache_tarfiles = cache_tarfiles - - def __len__(self): - return len(self.samples) - - def __getitem__(self, index): - sample = self.samples[index] - target = self.targets[index] - sample_ti, parent_fn, child_ti = sample - parent_abs = 
os.path.join(self.root, parent_fn) if parent_fn else self.root - - tf = None - cache_state = None - if self.cache_tarfiles: - cache_state = self.tar_state if self.root_is_tar else self.tar_state[parent_fn] - tf = cache_state.tf - if tf is None: - tf = tarfile.open(parent_abs) - if self.cache_tarfiles: - cache_state.tf = tf - if child_ti is not None: - ctf = cache_state.children[child_ti.name].tf if self.cache_tarfiles else None - if ctf is None: - ctf = tarfile.open(fileobj=tf.extractfile(child_ti)) - if self.cache_tarfiles: - cache_state.children[child_ti.name].tf = ctf - tf = ctf - - return tf.extractfile(sample_ti), target - - def _filename(self, index, basename=False, absolute=False): - filename = self.samples[index][0].name - if basename: - filename = os.path.basename(filename) - return filename diff --git a/AVLFormer/src/timm/data/parsers/parser_image_tar.py b/AVLFormer/src/timm/data/parsers/parser_image_tar.py deleted file mode 100644 index 53d7df4..0000000 --- a/AVLFormer/src/timm/data/parsers/parser_image_tar.py +++ /dev/null @@ -1,72 +0,0 @@ -""" A dataset parser that reads single tarfile based datasets - -This parser can read datasets consisting of a single tarfile containing images. -I am planning to deprecate it in favour of ParserImageInTar. - -Hacked together by / Copyright 2020 Ross Wightman -""" -import os -import tarfile - -from .parser import Parser -from .class_map import load_class_map -from .constants import IMG_EXTENSIONS -from src.timm.utils.misc import natural_key - - -def extract_tarinfo(tarfile, class_to_idx=None, sort=True): - files = [] - labels = [] - for ti in tarfile.getmembers(): - if not ti.isfile(): - continue - dirname, basename = os.path.split(ti.path) - label = os.path.basename(dirname) - ext = os.path.splitext(basename)[1] - if ext.lower() in IMG_EXTENSIONS: - files.append(ti) - labels.append(label) - if class_to_idx is None: - unique_labels = set(labels) - sorted_labels = list(sorted(unique_labels, key=natural_key)) - class_to_idx = {c: idx for idx, c in enumerate(sorted_labels)} - tarinfo_and_targets = [(f, class_to_idx[l]) for f, l in zip(files, labels) if l in class_to_idx] - if sort: - tarinfo_and_targets = sorted(tarinfo_and_targets, key=lambda k: natural_key(k[0].path)) - return tarinfo_and_targets, class_to_idx - - -class ParserImageTar(Parser): - """ Single tarfile dataset where classes are mapped to folders within tar - NOTE: This class is being deprecated in favour of the more capable ParserImageInTar that can - operate on folders of tars or tars in tars.
- """ - def __init__(self, root, class_map=''): - super().__init__() - - class_to_idx = None - if class_map: - class_to_idx = load_class_map(class_map, root) - assert os.path.isfile(root) - self.root = root - - with tarfile.open(root) as tf: # cannot keep this open across processes, reopen later - self.samples, self.class_to_idx = extract_tarinfo(tf, class_to_idx) - self.imgs = self.samples - self.tarfile = None # lazy init in __getitem__ - - def __getitem__(self, index): - if self.tarfile is None: - self.tarfile = tarfile.open(self.root) - tarinfo, target = self.samples[index] - fileobj = self.tarfile.extractfile(tarinfo) - return fileobj, target - - def __len__(self): - return len(self.samples) - - def _filename(self, index, basename=False, absolute=False): - filename = self.samples[index][0].name - if basename: - filename = os.path.basename(filename) - return filename diff --git a/AVLFormer/src/timm/data/parsers/parser_tfds.py b/AVLFormer/src/timm/data/parsers/parser_tfds.py deleted file mode 100644 index 15361cb..0000000 --- a/AVLFormer/src/timm/data/parsers/parser_tfds.py +++ /dev/null @@ -1,201 +0,0 @@ -""" Dataset parser interface that wraps TFDS datasets - -Wraps many (most?) TFDS image-classification datasets -from https://github.com/tensorflow/datasets -https://www.tensorflow.org/datasets/catalog/overview#image_classification - -Hacked together by / Copyright 2020 Ross Wightman -""" -import os -import io -import math -import torch -import torch.distributed as dist -from PIL import Image - -try: - import tensorflow as tf - tf.config.set_visible_devices([], 'GPU') # Hands off my GPU! (or pip install tensorflow-cpu) - import tensorflow_datasets as tfds -except ImportError as e: - print(e) - print("Please install tensorflow_datasets package `pip install tensorflow-datasets`.") - exit(1) -from .parser import Parser - - -MAX_TP_SIZE = 8 # maximum TF threadpool size, only doing jpeg decodes and queuing activities -SHUFFLE_SIZE = 16834 # samples to shuffle in DS queue -PREFETCH_SIZE = 4096 # samples to prefetch - - -class ParserTfds(Parser): - """ Wrap Tensorflow Datasets for use in PyTorch - - There several things to be aware of: - * To prevent excessive samples being dropped per epoch w/ distributed training or multiplicity of - dataloader workers, the train iterator wraps to avoid returning partial batches that trigger drop_last - https://github.com/pytorch/pytorch/issues/33413 - * With PyTorch IterableDatasets, each worker in each replica operates in isolation, the final batch - from each worker could be a different size. For training this is worked around by option above, for - validation extra samples are inserted iff distributed mode is enabled so that the batches being reduced - across replicas are of same size. This will slightly alter the results, distributed validation will not be - 100% correct. This is similar to common handling in DistributedSampler for normal Datasets but a bit worse - since there are up to N * J extra samples with IterableDatasets. - * The sharding (splitting of dataset into TFRecord) files imposes limitations on the number of - replicas and dataloader workers you can use. For really small datasets that only contain a few shards - you may have to train non-distributed w/ 1-2 dataloader workers. This is likely not a huge concern as the - benefit of distributed training or fast dataloading should be much less for small datasets. - * This wrapper is currently configured to return individual, decompressed image samples from the TFDS - dataset. 
The augmentation (transforms) and batching is still done in PyTorch. It would be possible - to specify TF augmentation fn and return augmented batches w/ some modifications to other downstream - components. - - """ - def __init__(self, root, name, split='train', shuffle=False, is_training=False, batch_size=None): - super().__init__() - self.root = root - self.split = split - self.shuffle = shuffle - self.is_training = is_training - if self.is_training: - assert batch_size is not None,\ - "Must specify batch_size in training mode for reasonable behaviour w/ TFDS wrapper" - self.batch_size = batch_size - - self.builder = tfds.builder(name, data_dir=root) - # NOTE: please use tfds command line app to download & prepare datasets, I don't want to call - # download_and_prepare() by default here as it's caused issues generating unwanted paths. - self.num_samples = self.builder.info.splits[split].num_examples - self.ds = None # initialized lazily on each dataloader worker process - - self.worker_info = None - self.dist_rank = 0 - self.dist_num_replicas = 1 - if dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1: - self.dist_rank = dist.get_rank() - self.dist_num_replicas = dist.get_world_size() - - def _lazy_init(self): - """ Lazily initialize the dataset. - - This is necessary to init the Tensorflow dataset pipeline in the (dataloader) process that - will be using the dataset instance. The __init__ method is called on the main process, - this will be called in a dataloader worker process. - - NOTE: There will be problems if you try to re-use this dataset across different loader/worker - instances once it has been initialized. Do not call any dataset methods that can call _lazy_init - before it is passed to dataloader. - """ - worker_info = torch.utils.data.get_worker_info() - - # setup input context to split dataset across distributed processes - split = self.split - num_workers = 1 - if worker_info is not None: - self.worker_info = worker_info - num_workers = worker_info.num_workers - worker_id = worker_info.id - - # FIXME I need to spend more time figuring out the best way to distribute/split data across - # combo of distributed replicas + dataloader worker processes - """ - InputContext will assign subset of underlying TFRecord files to each 'pipeline' if used. - My understanding is that using split, the underling TFRecord files will shuffle (shuffle_files=True) - between the splits each iteration, but that understanding could be wrong. - Possible split options include: - * InputContext for both distributed & worker processes (current) - * InputContext for distributed and sub-splits for worker processes - * sub-splits for both - """ - # split_size = self.num_samples // num_workers - # start = worker_id * split_size - # if worker_id == num_workers - 1: - # split = split + '[{}:]'.format(start) - # else: - # split = split + '[{}:{}]'.format(start, start + split_size) - - input_context = tf.distribute.InputContext( - num_input_pipelines=self.dist_num_replicas * num_workers, - input_pipeline_id=self.dist_rank * num_workers + worker_id, - num_replicas_in_sync=self.dist_num_replicas # FIXME does this have any impact? 
- ) - - read_config = tfds.ReadConfig(input_context=input_context) - ds = self.builder.as_dataset(split=split, shuffle_files=self.shuffle, read_config=read_config) - # avoid overloading threading w/ combo of TF ds threads + PyTorch workers - ds.options().experimental_threading.private_threadpool_size = max(1, MAX_TP_SIZE // num_workers) - ds.options().experimental_threading.max_intra_op_parallelism = 1 - if self.is_training: - # to prevent excessive drop_last batch behaviour w/ IterableDatasets - # see warnings at https://pytorch.org/docs/stable/data.html#multi-process-data-loading - ds = ds.repeat() # allow wrap around and break iteration manually - if self.shuffle: - ds = ds.shuffle(min(self.num_samples // self._num_pipelines, SHUFFLE_SIZE), seed=0) - ds = ds.prefetch(min(self.num_samples // self._num_pipelines, PREFETCH_SIZE)) - self.ds = tfds.as_numpy(ds) - - def __iter__(self): - if self.ds is None: - self._lazy_init() - # compute a rounded up sample count that is used to: - # 1. make batches even across workers & replicas in distributed validation. - # This adds extra samples and will slightly alter validation results. - # 2. determine loop ending condition in training w/ repeat enabled so that only full batch_size - # batches are produced (underlying tfds iter wraps around) - target_sample_count = math.ceil(self.num_samples / self._num_pipelines) - if self.is_training: - # round up to nearest batch_size per worker-replica - target_sample_count = math.ceil(target_sample_count / self.batch_size) * self.batch_size - sample_count = 0 - for sample in self.ds: - img = Image.fromarray(sample['image'], mode='RGB') - yield img, sample['label'] - sample_count += 1 - if self.is_training and sample_count >= target_sample_count: - # Need to break out of loop when repeat() is enabled for training w/ oversampling - # this results in extra samples per epoch but seems more desirable than dropping - # up to N*J batches per epoch (where N = num distributed processes, and J = num worker processes) - break - if not self.is_training and self.dist_num_replicas > 1 and 0 < sample_count < target_sample_count: - # Validation batch padding only done for distributed training where results are reduced across nodes. - # For single process case, it won't matter if workers return different batch sizes. - # FIXME this needs more testing, possible for sharding / split api to cause differences of > 1? - assert target_sample_count - sample_count == 1 # should only be off by 1 or sharding is not optimal - yield img, sample['label'] # yield prev sample again - sample_count += 1 - - @property - def _num_workers(self): - return 1 if self.worker_info is None else self.worker_info.num_workers - - @property - def _num_pipelines(self): - return self._num_workers * self.dist_num_replicas - - def __len__(self): - # this is just an estimate and does not factor in extra samples added to pad batches based on - # complete worker & replica info (not available until init in dataloader).
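
Concretely, the per-pipeline rounding used in __iter__ above works out as follows (hypothetical worker, replica, and dataset sizes, shown for illustration only):

import math

num_samples, replicas, workers, batch_size = 50000, 4, 3, 32
num_pipelines = replicas * workers                           # 12 parallel iterators
target = math.ceil(num_samples / num_pipelines)              # 4167 samples per pipeline
target_train = math.ceil(target / batch_size) * batch_size   # 4192: whole batches only
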
- return math.ceil(self.num_samples / self.dist_num_replicas) - - def _filename(self, index, basename=False, absolute=False): - assert False, "Not supported" # no random access to samples - - def filenames(self, basename=False, absolute=False): - """ Return all filenames in dataset, overrides base""" - if self.ds is None: - self._lazy_init() - names = [] - for sample in self.ds: - if len(names) > self.num_samples: - break # safety for ds.repeat() case - if 'file_name' in sample: - name = sample['file_name'] - elif 'filename' in sample: - name = sample['filename'] - elif 'id' in sample: - name = sample['id'] - else: - assert False, "No supported name field present" - names.append(name) - return names diff --git a/AVLFormer/src/timm/data/random_erasing.py b/AVLFormer/src/timm/data/random_erasing.py deleted file mode 100644 index 78967d1..0000000 --- a/AVLFormer/src/timm/data/random_erasing.py +++ /dev/null @@ -1,97 +0,0 @@ -""" Random Erasing (Cutout) - -Originally inspired by impl at https://github.com/zhunzhong07/Random-Erasing, Apache 2.0 -Copyright Zhun Zhong & Liang Zheng - -Hacked together by / Copyright 2020 Ross Wightman -""" -import random -import math -import torch - - -def _get_pixels(per_pixel, rand_color, patch_size, dtype=torch.float32, device='cuda'): - # NOTE I've seen CUDA illegal memory access errors being caused by the normal_() - # paths, flip the order so normal is run on CPU if this becomes a problem - # Issue has been fixed in master https://github.com/pytorch/pytorch/issues/19508 - if per_pixel: - return torch.empty(patch_size, dtype=dtype, device=device).normal_() - elif rand_color: - return torch.empty((patch_size[0], 1, 1), dtype=dtype, device=device).normal_() - else: - return torch.zeros((patch_size[0], 1, 1), dtype=dtype, device=device) - - -class RandomErasing: - """ Randomly selects a rectangle region in an image and erases its pixels. - 'Random Erasing Data Augmentation' by Zhong et al. - See https://arxiv.org/pdf/1708.04896.pdf - - This variant of RandomErasing is intended to be applied to either a batch - or single image tensor after it has been normalized by dataset mean and std. - Args: - probability: Probability that the Random Erasing operation will be performed. - min_area: Minimum percentage of erased area wrt input image area. - max_area: Maximum percentage of erased area wrt input image area. - min_aspect: Minimum aspect ratio of erased area. - mode: pixel color mode, one of 'const', 'rand', or 'pixel' - 'const' - erase block is constant color of 0 for all channels - 'rand' - erase block is same per-channel random (normal) color - 'pixel' - erase block is per-pixel random (normal) color - max_count: maximum number of erasing blocks per image, area per box is scaled by count. - per-image count is randomly chosen between 1 and this value. 
- """ - - def __init__( - self, - probability=0.5, min_area=0.02, max_area=1/3, min_aspect=0.3, max_aspect=None, - mode='const', min_count=1, max_count=None, num_splits=0, device='cuda'): - self.probability = probability - self.min_area = min_area - self.max_area = max_area - max_aspect = max_aspect or 1 / min_aspect - self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) - self.min_count = min_count - self.max_count = max_count or min_count - self.num_splits = num_splits - mode = mode.lower() - self.rand_color = False - self.per_pixel = False - if mode == 'rand': - self.rand_color = True # per block random normal - elif mode == 'pixel': - self.per_pixel = True # per pixel random normal - else: - assert not mode or mode == 'const' - self.device = device - - def _erase(self, img, chan, img_h, img_w, dtype): - if random.random() > self.probability: - return - area = img_h * img_w - count = self.min_count if self.min_count == self.max_count else \ - random.randint(self.min_count, self.max_count) - for _ in range(count): - for attempt in range(10): - target_area = random.uniform(self.min_area, self.max_area) * area / count - aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) - h = int(round(math.sqrt(target_area * aspect_ratio))) - w = int(round(math.sqrt(target_area / aspect_ratio))) - if w < img_w and h < img_h: - top = random.randint(0, img_h - h) - left = random.randint(0, img_w - w) - img[:, top:top + h, left:left + w] = _get_pixels( - self.per_pixel, self.rand_color, (chan, h, w), - dtype=dtype, device=self.device) - break - - def __call__(self, input): - if len(input.size()) == 3: - self._erase(input, *input.size(), input.dtype) - else: - batch_size, chan, img_h, img_w = input.size() - # skip first slice of batch if num_splits is set (for clean portion of samples) - batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0 - for i in range(batch_start, batch_size): - self._erase(input[i], chan, img_h, img_w, input.dtype) - return input diff --git a/AVLFormer/src/timm/data/real_labels.py b/AVLFormer/src/timm/data/real_labels.py deleted file mode 100644 index 939c348..0000000 --- a/AVLFormer/src/timm/data/real_labels.py +++ /dev/null @@ -1,42 +0,0 @@ -""" Real labels evaluator for ImageNet -Paper: `Are we done with ImageNet?` - https://arxiv.org/abs/2006.07159 -Based on Numpy example at https://github.com/google-research/reassessed-imagenet - -Hacked together by / Copyright 2020 Ross Wightman -""" -import os -import json -import numpy as np - - -class RealLabelsImagenet: - - def __init__(self, filenames, real_json='real.json', topk=(1, 5)): - with open(real_json) as real_labels: - real_labels = json.load(real_labels) - real_labels = {f'ILSVRC2012_val_{i + 1:08d}.JPEG': labels for i, labels in enumerate(real_labels)} - self.real_labels = real_labels - self.filenames = filenames - assert len(self.filenames) == len(self.real_labels) - self.topk = topk - self.is_correct = {k: [] for k in topk} - self.sample_idx = 0 - - def add_result(self, output): - maxk = max(self.topk) - _, pred_batch = output.topk(maxk, 1, True, True) - pred_batch = pred_batch.cpu().numpy() - for pred in pred_batch: - filename = self.filenames[self.sample_idx] - filename = os.path.basename(filename) - if self.real_labels[filename]: - for k in self.topk: - self.is_correct[k].append( - any([p in self.real_labels[filename] for p in pred[:k]])) - self.sample_idx += 1 - - def get_accuracy(self, k=None): - if k is None: - return {k: float(np.mean(self.is_correct[k])) * 100 for k in 
self.topk} - else: - return float(np.mean(self.is_correct[k])) * 100 diff --git a/AVLFormer/src/timm/data/tf_preprocessing.py b/AVLFormer/src/timm/data/tf_preprocessing.py deleted file mode 100644 index 44b4a3a..0000000 --- a/AVLFormer/src/timm/data/tf_preprocessing.py +++ /dev/null @@ -1,232 +0,0 @@ -""" Tensorflow Preprocessing Adapter - -Allows use of Tensorflow preprocessing pipeline in PyTorch Transform - -Copyright of original Tensorflow code below. - -Hacked together by / Copyright 2020 Ross Wightman -""" - -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""ImageNet preprocessing for MnasNet.""" -import tensorflow as tf -import numpy as np - -IMAGE_SIZE = 224 -CROP_PADDING = 32 - - -def distorted_bounding_box_crop(image_bytes, - bbox, - min_object_covered=0.1, - aspect_ratio_range=(0.75, 1.33), - area_range=(0.05, 1.0), - max_attempts=100, - scope=None): - """Generates cropped_image using one of the bboxes randomly distorted. - - See `tf.image.sample_distorted_bounding_box` for more documentation. - - Args: - image_bytes: `Tensor` of binary image data. - bbox: `Tensor` of bounding boxes arranged `[1, num_boxes, coords]` - where each coordinate is [0, 1) and the coordinates are arranged - as `[ymin, xmin, ymax, xmax]`. If num_boxes is 0 then use the whole - image. - min_object_covered: An optional `float`. Defaults to `0.1`. The cropped - area of the image must contain at least this fraction of any bounding - box supplied. - aspect_ratio_range: An optional list of `float`s. The cropped area of the - image must have an aspect ratio = width / height within this range. - area_range: An optional list of `float`s. The cropped area of the image - must contain a fraction of the supplied image within this range. - max_attempts: An optional `int`. Number of attempts at generating a cropped - region of the image of the specified constraints. After `max_attempts` - failures, return the entire image. - scope: Optional `str` for name scope. - Returns: - cropped image `Tensor` - """ - with tf.name_scope(scope, 'distorted_bounding_box_crop', [image_bytes, bbox]): - shape = tf.image.extract_jpeg_shape(image_bytes) - sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( - shape, - bounding_boxes=bbox, - min_object_covered=min_object_covered, - aspect_ratio_range=aspect_ratio_range, - area_range=area_range, - max_attempts=max_attempts, - use_image_if_no_bounding_boxes=True) - bbox_begin, bbox_size, _ = sample_distorted_bounding_box - - # Crop the image to the specified bounding box.
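(Backing up to RealLabelsImagenet above before the TF crop code continues: a sketch of the intended evaluation loop, with stand-ins for the model and loader. It assumes `real.json` is the 50k-entry label file from the ReaL release, aligned with validation-order filenames.)

```python
import torch

filenames = [f'ILSVRC2012_val_{i + 1:08d}.JPEG' for i in range(50000)]
real = RealLabelsImagenet(filenames, real_json='real.json', topk=(1, 5))
for _ in range(6250):              # stands in for iterating a val loader
    logits = torch.randn(8, 1000)  # stands in for model(input)
    real.add_result(logits)
top1, top5 = real.get_accuracy(k=1), real.get_accuracy(k=5)
```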
- offset_y, offset_x, _ = tf.unstack(bbox_begin) - target_height, target_width, _ = tf.unstack(bbox_size) - crop_window = tf.stack([offset_y, offset_x, target_height, target_width]) - image = tf.image.decode_and_crop_jpeg(image_bytes, crop_window, channels=3) - - return image - - -def _at_least_x_are_equal(a, b, x): - """At least `x` of `a` and `b` `Tensors` are equal.""" - match = tf.equal(a, b) - match = tf.cast(match, tf.int32) - return tf.greater_equal(tf.reduce_sum(match), x) - - -def _decode_and_random_crop(image_bytes, image_size, resize_method): - """Make a random crop of image_size.""" - bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]) - image = distorted_bounding_box_crop( - image_bytes, - bbox, - min_object_covered=0.1, - aspect_ratio_range=(3. / 4, 4. / 3.), - area_range=(0.08, 1.0), - max_attempts=10, - scope=None) - original_shape = tf.image.extract_jpeg_shape(image_bytes) - bad = _at_least_x_are_equal(original_shape, tf.shape(image), 3) - - image = tf.cond( - bad, - lambda: _decode_and_center_crop(image_bytes, image_size, resize_method), - lambda: tf.image.resize([image], [image_size, image_size], resize_method)[0]) - - return image - - -def _decode_and_center_crop(image_bytes, image_size, resize_method): - """Crops to center of image with padding then scales image_size.""" - shape = tf.image.extract_jpeg_shape(image_bytes) - image_height = shape[0] - image_width = shape[1] - - padded_center_crop_size = tf.cast( - ((image_size / (image_size + CROP_PADDING)) * - tf.cast(tf.minimum(image_height, image_width), tf.float32)), - tf.int32) - - offset_height = ((image_height - padded_center_crop_size) + 1) // 2 - offset_width = ((image_width - padded_center_crop_size) + 1) // 2 - crop_window = tf.stack([offset_height, offset_width, - padded_center_crop_size, padded_center_crop_size]) - image = tf.image.decode_and_crop_jpeg(image_bytes, crop_window, channels=3) - image = tf.image.resize([image], [image_size, image_size], resize_method)[0] - - return image - - -def _flip(image): - """Random horizontal image flip.""" - image = tf.image.random_flip_left_right(image) - return image - - -def preprocess_for_train(image_bytes, use_bfloat16, image_size=IMAGE_SIZE, interpolation='bicubic'): - """Preprocesses the given image for training. - - Args: - image_bytes: `Tensor` representing an image binary of arbitrary size. - use_bfloat16: `bool` for whether to use bfloat16. - image_size: image size. - interpolation: image interpolation method - - Returns: - A preprocessed image `Tensor`. - """ - resize_method = tf.image.ResizeMethod.BICUBIC if interpolation == 'bicubic' else tf.image.ResizeMethod.BILINEAR - image = _decode_and_random_crop(image_bytes, image_size, resize_method) - image = _flip(image) - image = tf.reshape(image, [image_size, image_size, 3]) - image = tf.image.convert_image_dtype( - image, dtype=tf.bfloat16 if use_bfloat16 else tf.float32) - return image - - -def preprocess_for_eval(image_bytes, use_bfloat16, image_size=IMAGE_SIZE, interpolation='bicubic'): - """Preprocesses the given image for evaluation. - - Args: - image_bytes: `Tensor` representing an image binary of arbitrary size. - use_bfloat16: `bool` for whether to use bfloat16. - image_size: image size. - interpolation: image interpolation method - - Returns: - A preprocessed image `Tensor`.
- """ - resize_method = tf.image.ResizeMethod.BICUBIC if interpolation == 'bicubic' else tf.image.ResizeMethod.BILINEAR - image = _decode_and_center_crop(image_bytes, image_size, resize_method) - image = tf.reshape(image, [image_size, image_size, 3]) - image = tf.image.convert_image_dtype( - image, dtype=tf.bfloat16 if use_bfloat16 else tf.float32) - return image - - -def preprocess_image(image_bytes, - is_training=False, - use_bfloat16=False, - image_size=IMAGE_SIZE, - interpolation='bicubic'): - """Preprocesses the given image. - - Args: - image_bytes: `Tensor` representing an image binary of arbitrary size. - is_training: `bool` for whether the preprocessing is for training. - use_bfloat16: `bool` for whether to use bfloat16. - image_size: image size. - interpolation: image interpolation method - - Returns: - A preprocessed image `Tensor` with value range of [0, 255]. - """ - if is_training: - return preprocess_for_train(image_bytes, use_bfloat16, image_size, interpolation) - else: - return preprocess_for_eval(image_bytes, use_bfloat16, image_size, interpolation) - - -class TfPreprocessTransform: - - def __init__(self, is_training=False, size=224, interpolation='bicubic'): - self.is_training = is_training - self.size = size[0] if isinstance(size, tuple) else size - self.interpolation = interpolation - self._image_bytes = None - self.process_image = self._build_tf_graph() - self.sess = None - - def _build_tf_graph(self): - with tf.device('/cpu:0'): - self._image_bytes = tf.placeholder( - shape=[], - dtype=tf.string, - ) - img = preprocess_image( - self._image_bytes, self.is_training, False, self.size, self.interpolation) - return img - - def __call__(self, image_bytes): - if self.sess is None: - self.sess = tf.Session() - img = self.sess.run(self.process_image, feed_dict={self._image_bytes: image_bytes}) - img = img.round().clip(0, 255).astype(np.uint8) - if img.ndim < 3: - img = np.expand_dims(img, axis=-1) - img = np.rollaxis(img, 2) # HWC to CHW - return img diff --git a/AVLFormer/src/timm/data/transforms.py b/AVLFormer/src/timm/data/transforms.py deleted file mode 100644 index b3b08e3..0000000 --- a/AVLFormer/src/timm/data/transforms.py +++ /dev/null @@ -1,158 +0,0 @@ -import torch -import torchvision.transforms.functional as F -from PIL import Image -import warnings -import math -import random -import numpy as np - - -class ToNumpy: - - def __call__(self, pil_img): - np_img = np.array(pil_img, dtype=np.uint8) - if np_img.ndim < 3: - np_img = np.expand_dims(np_img, axis=-1) - np_img = np.rollaxis(np_img, 2) # HWC to CHW - return np_img - - -class ToTensor: - - def __init__(self, dtype=torch.float32): - self.dtype = dtype - - def __call__(self, pil_img): - np_img = np.array(pil_img, dtype=np.uint8) - if np_img.ndim < 3: - np_img = np.expand_dims(np_img, axis=-1) - np_img = np.rollaxis(np_img, 2) # HWC to CHW - return torch.from_numpy(np_img).to(dtype=self.dtype) - - -_pil_interpolation_to_str = { - Image.NEAREST: 'PIL.Image.NEAREST', - Image.BILINEAR: 'PIL.Image.BILINEAR', - Image.BICUBIC: 'PIL.Image.BICUBIC', - Image.LANCZOS: 'PIL.Image.LANCZOS', - Image.HAMMING: 'PIL.Image.HAMMING', - Image.BOX: 'PIL.Image.BOX', -} - - -def _pil_interp(method): - if method == 'bicubic': - return Image.BICUBIC - elif method == 'lanczos': - return Image.LANCZOS - elif method == 'hamming': - return Image.HAMMING - else: - # default bilinear, do we want to allow nearest? 
- return Image.BILINEAR - - -_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC) - - -class RandomResizedCropAndInterpolation: - """Crop the given PIL Image to random size and aspect ratio with random interpolation. - - A crop of random size (default: of 0.08 to 1.0) of the original size and a random - aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop - is finally resized to given size. - This is popularly used to train the Inception networks. - - Args: - size: expected output size of each edge - scale: range of size of the origin size cropped - ratio: range of aspect ratio of the origin aspect ratio cropped - interpolation: Default: PIL.Image.BILINEAR - """ - - def __init__(self, size, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.), - interpolation='bilinear'): - if isinstance(size, tuple): - self.size = size - else: - self.size = (size, size) - if (scale[0] > scale[1]) or (ratio[0] > ratio[1]): - warnings.warn("range should be of kind (min, max)") - - if interpolation == 'random': - self.interpolation = _RANDOM_INTERPOLATION - else: - self.interpolation = _pil_interp(interpolation) - self.scale = scale - self.ratio = ratio - - @staticmethod - def get_params(img, scale, ratio): - """Get parameters for ``crop`` for a random sized crop. - - Args: - img (PIL Image): Image to be cropped. - scale (tuple): range of size of the origin size cropped - ratio (tuple): range of aspect ratio of the origin aspect ratio cropped - - Returns: - tuple: params (i, j, h, w) to be passed to ``crop`` for a random - sized crop. - """ - area = img.size[0] * img.size[1] - - for attempt in range(10): - target_area = random.uniform(*scale) * area - log_ratio = (math.log(ratio[0]), math.log(ratio[1])) - aspect_ratio = math.exp(random.uniform(*log_ratio)) - - w = int(round(math.sqrt(target_area * aspect_ratio))) - h = int(round(math.sqrt(target_area / aspect_ratio))) - - if w <= img.size[0] and h <= img.size[1]: - i = random.randint(0, img.size[1] - h) - j = random.randint(0, img.size[0] - w) - return i, j, h, w - - # Fallback to central crop - in_ratio = img.size[0] / img.size[1] - if in_ratio < min(ratio): - w = img.size[0] - h = int(round(w / min(ratio))) - elif in_ratio > max(ratio): - h = img.size[1] - w = int(round(h * max(ratio))) - else: # whole image - w = img.size[0] - h = img.size[1] - i = (img.size[1] - h) // 2 - j = (img.size[0] - w) // 2 - return i, j, h, w - - def __call__(self, img): - """ - Args: - img (PIL Image): Image to be cropped and resized. - - Returns: - PIL Image: Randomly cropped and resized image. 
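(The interpolation handling is the point of this class; a sketch of the 'random' mode, which re-draws BILINEAR or BICUBIC on every call, something torchvision's RandomResizedCrop of that era did not offer. The blank image is a stand-in for a real photo.)

```python
from PIL import Image

rrc = RandomResizedCropAndInterpolation(224, interpolation='random')
img = Image.new('RGB', (320, 240))  # stand-in input
out = rrc(img)                      # 224x224 crop; interpolation re-drawn each call
```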
- """ - i, j, h, w = self.get_params(img, self.scale, self.ratio) - if isinstance(self.interpolation, (tuple, list)): - interpolation = random.choice(self.interpolation) - else: - interpolation = self.interpolation - return F.resized_crop(img, i, j, h, w, self.size, interpolation) - - def __repr__(self): - if isinstance(self.interpolation, (tuple, list)): - interpolate_str = ' '.join([_pil_interpolation_to_str[x] for x in self.interpolation]) - else: - interpolate_str = _pil_interpolation_to_str[self.interpolation] - format_string = self.__class__.__name__ + '(size={0}'.format(self.size) - format_string += ', scale={0}'.format(tuple(round(s, 4) for s in self.scale)) - format_string += ', ratio={0}'.format(tuple(round(r, 4) for r in self.ratio)) - format_string += ', interpolation={0})'.format(interpolate_str) - return format_string - - diff --git a/AVLFormer/src/timm/data/transforms_factory.py b/AVLFormer/src/timm/data/transforms_factory.py deleted file mode 100644 index f7ecc0f..0000000 --- a/AVLFormer/src/timm/data/transforms_factory.py +++ /dev/null @@ -1,236 +0,0 @@ -""" Transforms Factory -Factory methods for building image transforms for use with TIMM (PyTorch Image Models) - -Hacked together by / Copyright 2020 Ross Wightman -""" -import math - -import torch -from torchvision import transforms - -from src.timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, DEFAULT_CROP_PCT -from src.timm.data.auto_augment import rand_augment_transform, augment_and_mix_transform, auto_augment_transform -from src.timm.data.transforms import _pil_interp, RandomResizedCropAndInterpolation, ToNumpy, ToTensor -from src.timm.data.random_erasing import RandomErasing - - -def transforms_noaug_train( - img_size=224, - interpolation='bilinear', - use_prefetcher=False, - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, -): - if interpolation == 'random': - # random interpolation not supported with no-aug - interpolation = 'bilinear' - tfl = [ - transforms.Resize(img_size, _pil_interp(interpolation)), - transforms.CenterCrop(img_size) - ] - if use_prefetcher: - # prefetcher and collate will handle tensor conversion and norm - tfl += [ToNumpy()] - else: - tfl += [ - transforms.ToTensor(), - transforms.Normalize( - mean=torch.tensor(mean), - std=torch.tensor(std)) - ] - return transforms.Compose(tfl) - - -def transforms_imagenet_train( - img_size=224, - scale=None, - ratio=None, - hflip=0.5, - vflip=0., - color_jitter=0.4, - auto_augment=None, - interpolation='random', - use_prefetcher=False, - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - re_prob=0., - re_mode='const', - re_count=1, - re_num_splits=0, - separate=False, -): - """ - If separate==True, the transforms are returned as a tuple of 3 separate transforms - for use in a mixing dataset that passes - * all data through the first (primary) transform, called the 'clean' data - * a portion of the data through the secondary transform - * normalizes and converts the branches above with the third, final transform - """ - scale = tuple(scale or (0.08, 1.0)) # default imagenet scale range - ratio = tuple(ratio or (3./4., 4./3.)) # default imagenet ratio range - primary_tfl = [ - RandomResizedCropAndInterpolation(img_size, scale=scale, ratio=ratio, interpolation=interpolation)] - if hflip > 0.: - primary_tfl += [transforms.RandomHorizontalFlip(p=hflip)] - if vflip > 0.: - primary_tfl += [transforms.RandomVerticalFlip(p=vflip)] - - secondary_tfl = [] - if auto_augment: - assert isinstance(auto_augment, str) - if 
isinstance(img_size, tuple): - img_size_min = min(img_size) - else: - img_size_min = img_size - aa_params = dict( - translate_const=int(img_size_min * 0.45), - img_mean=tuple([min(255, round(255 * x)) for x in mean]), - ) - if interpolation and interpolation != 'random': - aa_params['interpolation'] = _pil_interp(interpolation) - if auto_augment.startswith('rand'): - secondary_tfl += [rand_augment_transform(auto_augment, aa_params)] - elif auto_augment.startswith('augmix'): - aa_params['translate_pct'] = 0.3 - secondary_tfl += [augment_and_mix_transform(auto_augment, aa_params)] - else: - secondary_tfl += [auto_augment_transform(auto_augment, aa_params)] - elif color_jitter is not None: - # color jitter is enabled when not using AA - if isinstance(color_jitter, (list, tuple)): - # color jitter should be a 3-tuple/list if spec brightness/contrast/saturation - # or 4 if also augmenting hue - assert len(color_jitter) in (3, 4) - else: - # if it's a scalar, duplicate for brightness, contrast, and saturation, no hue - color_jitter = (float(color_jitter),) * 3 - secondary_tfl += [transforms.ColorJitter(*color_jitter)] - - final_tfl = [] - if use_prefetcher: - # prefetcher and collate will handle tensor conversion and norm - final_tfl += [ToNumpy()] - else: - final_tfl += [ - transforms.ToTensor(), - transforms.Normalize( - mean=torch.tensor(mean), - std=torch.tensor(std)) - ] - if re_prob > 0.: - final_tfl.append( - RandomErasing(re_prob, mode=re_mode, max_count=re_count, num_splits=re_num_splits, device='cpu')) - - if separate: - return transforms.Compose(primary_tfl), transforms.Compose(secondary_tfl), transforms.Compose(final_tfl) - else: - return transforms.Compose(primary_tfl + secondary_tfl + final_tfl) - - -def transforms_imagenet_eval( - img_size=224, - crop_pct=None, - interpolation='bilinear', - use_prefetcher=False, - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD): - crop_pct = crop_pct or DEFAULT_CROP_PCT - - if isinstance(img_size, tuple): - assert len(img_size) == 2 - if img_size[-1] == img_size[-2]: - # fall-back to older behaviour so Resize scales to shortest edge if target is square - scale_size = int(math.floor(img_size[0] / crop_pct)) - else: - scale_size = tuple([int(x / crop_pct) for x in img_size]) - else: - scale_size = int(math.floor(img_size / crop_pct)) - - tfl = [ - transforms.Resize(scale_size, _pil_interp(interpolation)), - transforms.CenterCrop(img_size), - ] - if use_prefetcher: - # prefetcher and collate will handle tensor conversion and norm - tfl += [ToNumpy()] - else: - tfl += [ - transforms.ToTensor(), - transforms.Normalize( - mean=torch.tensor(mean), - std=torch.tensor(std)) - ] - - return transforms.Compose(tfl) - - -def create_transform( - input_size, - is_training=False, - use_prefetcher=False, - no_aug=False, - scale=None, - ratio=None, - hflip=0.5, - vflip=0., - color_jitter=0.4, - auto_augment=None, - interpolation='bilinear', - mean=IMAGENET_DEFAULT_MEAN, - std=IMAGENET_DEFAULT_STD, - re_prob=0., - re_mode='const', - re_count=1, - re_num_splits=0, - crop_pct=None, - tf_preprocessing=False, - separate=False): - - if isinstance(input_size, tuple): - img_size = input_size[-2:] - else: - img_size = input_size - - if tf_preprocessing and use_prefetcher: - assert not separate, "Separate transforms not supported for TF preprocessing" - from src.timm.data.tf_preprocessing import TfPreprocessTransform - transform = TfPreprocessTransform( - is_training=is_training, size=img_size, interpolation=interpolation) - else: - if is_training and no_aug: - 
assert not separate, "Cannot perform split augmentation with no_aug" - transform = transforms_noaug_train( - img_size, - interpolation=interpolation, - use_prefetcher=use_prefetcher, - mean=mean, - std=std) - elif is_training: - transform = transforms_imagenet_train( - img_size, - scale=scale, - ratio=ratio, - hflip=hflip, - vflip=vflip, - color_jitter=color_jitter, - auto_augment=auto_augment, - interpolation=interpolation, - use_prefetcher=use_prefetcher, - mean=mean, - std=std, - re_prob=re_prob, - re_mode=re_mode, - re_count=re_count, - re_num_splits=re_num_splits, - separate=separate) - else: - assert not separate, "Separate transforms not supported for validation preprocessing" - transform = transforms_imagenet_eval( - img_size, - interpolation=interpolation, - use_prefetcher=use_prefetcher, - mean=mean, - std=std, - crop_pct=crop_pct) - - return transform diff --git a/AVLFormer/src/timm/models/__init__.py b/AVLFormer/src/timm/models/__init__.py deleted file mode 100644 index 16ced3d..0000000 --- a/AVLFormer/src/timm/models/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from .byoanet import * -from .byobnet import * -from .cspnet import * -from .densenet import * -from .dla import * -from .dpn import * -from .efficientnet import * -from .ghostnet import * -from .gluon_resnet import * -from .gluon_xception import * -from .hardcorenas import * -from .hrnet import * -from .inception_resnet_v2 import * -from .inception_v3 import * -from .inception_v4 import * -from .mobilenetv3 import * -from .nasnet import * -from .nfnet import * -from .pit import * -from .pnasnet import * -from .regnet import * -from .res2net import * -from .resnest import * -from .resnet import * -from .resnetv2 import * -from .rexnet import * -from .selecsls import * -from .senet import * -from .sknet import * -from .swin_transformer import * -from .tnt import * -from .tresnet import * -from .vgg import * -from .vision_transformer import * -from .vision_transformer_hybrid import * -from .vovnet import * -from .xception import * -from .xception_aligned import * - -from .factory import create_model, split_model_name, safe_model_name -from .helpers import load_checkpoint, resume_checkpoint, model_parameters -from .layers import TestTimePoolHead, apply_test_time_pool -from .layers import convert_splitbn_model -from .layers import is_scriptable, is_exportable, set_scriptable, set_exportable, is_no_jit, set_no_jit -from .registry import register_model, model_entrypoint, list_models, is_model, list_modules, is_model_in_modules,\ - has_model_default_key, is_model_default_key, get_model_default_value, is_model_pretrained diff --git a/AVLFormer/src/timm/models/byoanet.py b/AVLFormer/src/timm/models/byoanet.py deleted file mode 100644 index a683190..0000000 --- a/AVLFormer/src/timm/models/byoanet.py +++ /dev/null @@ -1,430 +0,0 @@ -""" Bring-Your-Own-Attention Network - -A flexible network w/ dataclass based config for stacking NN blocks including -self-attention (or similar) layers. - -Currently used to implement experimental variants of: - * Bottleneck Transformers - * Lambda ResNets - * HaloNets - -Consider all of the model definitions here as experimental WIP and likely to change. - -Hacked together by / copyright Ross Wightman, 2021.
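(Backing up to the transforms factory deleted just above, before byoanet.py continues: create_transform is the single entry point that wires the crop, augmentation, and erasing pieces together. A hedged sketch of a typical training setup; 'rand-m9-mstd0.5' is one of the standard RandAugment spec strings parsed by rand_augment_transform.)

```python
train_tf = create_transform(
    input_size=224,
    is_training=True,
    interpolation='random',
    auto_augment='rand-m9-mstd0.5',  # RandAugment: magnitude 9, mstd 0.5
    re_prob=0.25,                    # RandomErasing on the normalized tensor
    re_mode='pixel',
)
```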
-""" -import math -from dataclasses import dataclass, field -from collections import OrderedDict -from typing import Tuple, List, Optional, Union, Any, Callable -from functools import partial - -import torch -import torch.nn as nn - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .byobnet import BlocksCfg, ByobCfg, create_byob_stem, create_byob_stages, create_downsample,\ - reduce_feat_size, register_block, num_groups, LayerFn, _init_weights -from .helpers import build_model_with_cfg -from .layers import ClassifierHead, ConvBnAct, DropPath, get_act_layer, convert_norm_act, get_attn, get_self_attn,\ - make_divisible, to_2tuple -from .registry import register_model - -__all__ = ['ByoaNet'] - - -def _cfg(url='', **kwargs): - return { - 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), - 'crop_pct': 0.875, 'interpolation': 'bilinear', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'stem.conv1.conv', 'classifier': 'head.fc', - 'fixed_input_size': False, 'min_input_size': (3, 224, 224), - **kwargs - } - - -default_cfgs = { - # GPU-Efficient (ResNet) weights - 'botnet50t_224': _cfg(url='', fixed_input_size=True), - 'botnet50t_c4c5_224': _cfg(url='', fixed_input_size=True), - - 'halonet_h1': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8), min_input_size=(3, 256, 256)), - 'halonet_h1_c4c5': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8), min_input_size=(3, 256, 256)), - 'halonet26t': _cfg(url=''), - 'halonet50t': _cfg(url=''), - - 'lambda_resnet26t': _cfg(url='', min_input_size=(3, 128, 128)), - 'lambda_resnet50t': _cfg(url='', min_input_size=(3, 128, 128)), -} - - -@dataclass -class ByoaBlocksCfg(BlocksCfg): - # FIXME allow overriding self_attn layer or args per block/stage, - pass - - -@dataclass -class ByoaCfg(ByobCfg): - blocks: Tuple[Union[ByoaBlocksCfg, Tuple[ByoaBlocksCfg, ...]], ...] 
= None - self_attn_layer: Optional[str] = None - self_attn_fixed_size: bool = False - self_attn_kwargs: dict = field(default_factory=lambda: dict()) - - -def interleave_attn( - types : Tuple[str, str], every: Union[int, List[int]], d, first: bool = False, **kwargs -) -> Tuple[ByoaBlocksCfg]: - """ interleave attn blocks - """ - assert len(types) == 2 - if isinstance(every, int): - every = list(range(0 if first else every, d, every)) - if not every: - every = [d - 1] - every = set(every) # set for fast membership tests below - blocks = [] - for i in range(d): - block_type = types[1] if i in every else types[0] - blocks += [ByoaBlocksCfg(type=block_type, d=1, **kwargs)] - return tuple(blocks) - - -model_cfgs = dict( - - botnet50t=ByoaCfg( - blocks=( - ByoaBlocksCfg(type='bottle', d=3, c=256, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='bottle', d=4, c=512, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='bottle', d=6, c=1024, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='self_attn', d=3, c=2048, s=1, gs=0, br=0.25), - ), - stem_chs=64, - stem_type='tiered', - stem_pool='', - num_features=0, - self_attn_layer='bottleneck', - self_attn_fixed_size=True, - self_attn_kwargs=dict() - ), - botnet50t_c4c5=ByoaCfg( - blocks=( - ByoaBlocksCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25), - ByoaBlocksCfg(type='bottle', d=4, c=512, s=2, gs=0, br=0.25), - ( - ByoaBlocksCfg(type='self_attn', d=1, c=1024, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='bottle', d=5, c=1024, s=1, gs=0, br=0.25), - ), - ( - ByoaBlocksCfg(type='self_attn', d=1, c=2048, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='bottle', d=2, c=2048, s=1, gs=0, br=0.25), - ) - ), - stem_chs=64, - stem_type='tiered', - stem_pool='maxpool', - num_features=0, - self_attn_layer='bottleneck', - self_attn_fixed_size=True, - self_attn_kwargs=dict() - ), - - halonet_h1=ByoaCfg( - blocks=( - ByoaBlocksCfg(type='self_attn', d=3, c=64, s=1, gs=0, br=1.0), - ByoaBlocksCfg(type='self_attn', d=3, c=128, s=2, gs=0, br=1.0), - ByoaBlocksCfg(type='self_attn', d=10, c=256, s=2, gs=0, br=1.0), - ByoaBlocksCfg(type='self_attn', d=3, c=512, s=2, gs=0, br=1.0), - ), - stem_chs=64, - stem_type='7x7', - stem_pool='maxpool', - num_features=0, - self_attn_layer='halo', - self_attn_kwargs=dict(block_size=8, halo_size=3), - ), - halonet_h1_c4c5=ByoaCfg( - blocks=( - ByoaBlocksCfg(type='bottle', d=3, c=64, s=1, gs=0, br=1.0), - ByoaBlocksCfg(type='bottle', d=3, c=128, s=2, gs=0, br=1.0), - ByoaBlocksCfg(type='self_attn', d=10, c=256, s=2, gs=0, br=1.0), - ByoaBlocksCfg(type='self_attn', d=3, c=512, s=2, gs=0, br=1.0), - ), - stem_chs=64, - stem_type='tiered', - stem_pool='maxpool', - num_features=0, - self_attn_layer='halo', - self_attn_kwargs=dict(block_size=8, halo_size=3), - ), - halonet26t=ByoaCfg( - blocks=( - ByoaBlocksCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), - ByoaBlocksCfg(type='bottle', d=2, c=512, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='bottle', d=2, c=1024, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25), - ), - stem_chs=64, - stem_type='tiered', - stem_pool='maxpool', - num_features=0, - self_attn_layer='halo', - self_attn_kwargs=dict(block_size=7, halo_size=2) - ), - halonet50t=ByoaCfg( - blocks=( - ByoaBlocksCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25), - ByoaBlocksCfg(type='bottle', d=4, c=512, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='bottle', d=6, c=1024, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='self_attn', d=3, c=2048, s=2, gs=0, br=0.25), - ), - stem_chs=64, - stem_type='tiered', - stem_pool='maxpool', - num_features=0, - self_attn_layer='halo', -
self_attn_kwargs=dict(block_size=7, halo_size=2) - ), - - lambda_resnet26t=ByoaCfg( - blocks=( - ByoaBlocksCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), - ByoaBlocksCfg(type='bottle', d=2, c=512, s=2, gs=0, br=0.25), - interleave_attn(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25), - ), - stem_chs=64, - stem_type='tiered', - stem_pool='maxpool', - num_features=0, - self_attn_layer='lambda', - self_attn_kwargs=dict() - ), - lambda_resnet50t=ByoaCfg( - blocks=( - ByoaBlocksCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25), - ByoaBlocksCfg(type='bottle', d=4, c=512, s=2, gs=0, br=0.25), - interleave_attn(types=('bottle', 'self_attn'), every=3, d=6, c=1024, s=2, gs=0, br=0.25), - ByoaBlocksCfg(type='self_attn', d=3, c=2048, s=2, gs=0, br=0.25), - ), - stem_chs=64, - stem_type='tiered', - stem_pool='maxpool', - num_features=0, - self_attn_layer='lambda', - self_attn_kwargs=dict() - ), -) - - -@dataclass -class ByoaLayerFn(LayerFn): - self_attn: Optional[Callable] = None - - -class SelfAttnBlock(nn.Module): - """ ResNet-like Bottleneck Block - 1x1 - optional kxk - self attn - 1x1 - """ - - def __init__(self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1., group_size=None, - downsample='avg', extra_conv=False, linear_out=False, post_attn_na=True, feat_size=None, - layers: ByoaLayerFn = None, drop_block=None, drop_path_rate=0.): - super(SelfAttnBlock, self).__init__() - assert layers is not None - mid_chs = make_divisible(out_chs * bottle_ratio) - groups = num_groups(group_size, mid_chs) - - if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]: - self.shortcut = create_downsample( - downsample, in_chs=in_chs, out_chs=out_chs, stride=stride, dilation=dilation[0], - apply_act=False, layers=layers) - else: - self.shortcut = nn.Identity() - - self.conv1_1x1 = layers.conv_norm_act(in_chs, mid_chs, 1) - if extra_conv: - self.conv2_kxk = layers.conv_norm_act( - mid_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0], - groups=groups, drop_block=drop_block) - stride = 1 # striding done via conv if enabled - else: - self.conv2_kxk = nn.Identity() - opt_kwargs = {} if feat_size is None else dict(feat_size=feat_size) - # FIXME need to dilate self attn to have dilated network support, moop moop - self.self_attn = layers.self_attn(mid_chs, stride=stride, **opt_kwargs) - self.post_attn = layers.norm_act(mid_chs) if post_attn_na else nn.Identity() - self.conv3_1x1 = layers.conv_norm_act(mid_chs, out_chs, 1, apply_act=False) - self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. 
else nn.Identity() - self.act = nn.Identity() if linear_out else layers.act(inplace=True) - - def init_weights(self, zero_init_last_bn=False): - if zero_init_last_bn: - nn.init.zeros_(self.conv3_1x1.bn.weight) - - def forward(self, x): - shortcut = self.shortcut(x) - - x = self.conv1_1x1(x) - x = self.conv2_kxk(x) - x = self.self_attn(x) - x = self.post_attn(x) - x = self.conv3_1x1(x) - x = self.drop_path(x) - - x = self.act(x + shortcut) - return x - -register_block('self_attn', SelfAttnBlock) - - -def _byoa_block_args(block_kwargs, block_cfg: ByoaBlocksCfg, model_cfg: ByoaCfg, feat_size=None): - if block_cfg.type == 'self_attn' and model_cfg.self_attn_fixed_size: - assert feat_size is not None - block_kwargs['feat_size'] = feat_size - return block_kwargs - - -def get_layer_fns(cfg: ByoaCfg): - act = get_act_layer(cfg.act_layer) - norm_act = convert_norm_act(norm_layer=cfg.norm_layer, act_layer=act) - conv_norm_act = partial(ConvBnAct, norm_layer=cfg.norm_layer, act_layer=act) - attn = partial(get_attn(cfg.attn_layer), **cfg.attn_kwargs) if cfg.attn_layer else None - self_attn = partial(get_self_attn(cfg.self_attn_layer), **cfg.self_attn_kwargs) if cfg.self_attn_layer else None - layer_fn = ByoaLayerFn( - conv_norm_act=conv_norm_act, norm_act=norm_act, act=act, attn=attn, self_attn=self_attn) - return layer_fn - - -class ByoaNet(nn.Module): - """ 'Bring-your-own-attention' Net - - A ResNet inspired backbone that supports interleaving traditional residual blocks with - 'Self Attention' bottleneck blocks that replace the bottleneck kxk conv w/ a self-attention - or similar module. - - FIXME This class network definition is almost the same as ByobNet, I'd like to merge them but - torchscript limitations prevent sensible inheritance overrides. - """ - def __init__(self, cfg: ByoaCfg, num_classes=1000, in_chans=3, output_stride=32, global_pool='avg', - zero_init_last_bn=True, img_size=None, drop_rate=0., drop_path_rate=0.): - super().__init__() - self.num_classes = num_classes - self.drop_rate = drop_rate - layers = get_layer_fns(cfg) - feat_size = to_2tuple(img_size) if img_size is not None else None - - self.feature_info = [] - stem_chs = int(round((cfg.stem_chs or cfg.blocks[0].c) * cfg.width_factor)) - self.stem, stem_feat = create_byob_stem(in_chans, stem_chs, cfg.stem_type, cfg.stem_pool, layers=layers) - self.feature_info.extend(stem_feat[:-1]) - feat_size = reduce_feat_size(feat_size, stride=stem_feat[-1]['reduction']) - - self.stages, stage_feat = create_byob_stages( - cfg, drop_path_rate, output_stride, stem_feat[-1], - feat_size=feat_size, layers=layers, extra_args_fn=_byoa_block_args) - self.feature_info.extend(stage_feat[:-1]) - - prev_chs = stage_feat[-1]['num_chs'] - if cfg.num_features: - self.num_features = int(round(cfg.width_factor * cfg.num_features)) - self.final_conv = layers.conv_norm_act(prev_chs, self.num_features, 1) - else: - self.num_features = prev_chs - self.final_conv = nn.Identity() - self.feature_info += [ - dict(num_chs=self.num_features, reduction=stage_feat[-1]['reduction'], module='final_conv')] - - self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) - - for n, m in self.named_modules(): - _init_weights(m, n) - for m in self.modules(): - # call each block's weight init for block-specific overrides to init above - if hasattr(m, 'init_weights'): - m.init_weights(zero_init_last_bn=zero_init_last_bn) - - def get_classifier(self): - return self.head.fc - - def reset_classifier(self, num_classes, 
global_pool='avg'): - self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) - - def forward_features(self, x): - x = self.stem(x) - x = self.stages(x) - x = self.final_conv(x) - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.head(x) - return x - - -def _create_byoanet(variant, cfg_variant=None, pretrained=False, **kwargs): - return build_model_with_cfg( - ByoaNet, variant, pretrained, - default_cfg=default_cfgs[variant], - model_cfg=model_cfgs[variant] if not cfg_variant else model_cfgs[cfg_variant], - feature_cfg=dict(flatten_sequential=True), - **kwargs) - - -@register_model -def botnet50t_224(pretrained=False, **kwargs): - """ Bottleneck Transformer w/ ResNet50-T backbone. Bottleneck attn in final stage. - """ - kwargs.setdefault('img_size', 224) - return _create_byoanet('botnet50t_224', 'botnet50t', pretrained=pretrained, **kwargs) - - -@register_model -def botnet50t_c4c5_224(pretrained=False, **kwargs): - """ Bottleneck Transformer w/ ResNet50-T backbone. Bottleneck attn in last two stages. - """ - kwargs.setdefault('img_size', 224) - return _create_byoanet('botnet50t_c4c5_224', 'botnet50t_c4c5', pretrained=pretrained, **kwargs) - - -@register_model -def halonet_h1(pretrained=False, **kwargs): - """ HaloNet-H1. Halo attention in all stages as per the paper. - - This runs very slowly, param count lower than paper --> something is wrong. - """ - return _create_byoanet('halonet_h1', pretrained=pretrained, **kwargs) - - -@register_model -def halonet_h1_c4c5(pretrained=False, **kwargs): - """ HaloNet-H1 config w/ attention in last two stages. - """ - return _create_byoanet('halonet_h1_c4c5', pretrained=pretrained, **kwargs) - - -@register_model -def halonet26t(pretrained=False, **kwargs): - """ HaloNet w/ a ResNet26-t backbone, Halo attention in final stage - """ - return _create_byoanet('halonet26t', pretrained=pretrained, **kwargs) - - -@register_model -def halonet50t(pretrained=False, **kwargs): - """ HaloNet w/ a ResNet50-t backbone, Halo attention in final stage - """ - return _create_byoanet('halonet50t', pretrained=pretrained, **kwargs) - - -@register_model -def lambda_resnet26t(pretrained=False, **kwargs): - """ Lambda-ResNet-26T. Lambda layers in one C4 stage and all C5. - """ - return _create_byoanet('lambda_resnet26t', pretrained=pretrained, **kwargs) - - -@register_model -def lambda_resnet50t(pretrained=False, **kwargs): - """ Lambda-ResNet-50T. Lambda layers in one C4 stage and all C5. - """ - return _create_byoanet('lambda_resnet50t', pretrained=pretrained, **kwargs) diff --git a/AVLFormer/src/timm/models/byobnet.py b/AVLFormer/src/timm/models/byobnet.py deleted file mode 100644 index c5b1e3f..0000000 --- a/AVLFormer/src/timm/models/byobnet.py +++ /dev/null @@ -1,838 +0,0 @@ -""" Bring-Your-Own-Blocks Network - -A flexible network w/ dataclass based config for stacking those NN blocks. - -This model is currently used to implement the following networks: - -GPU Efficient (ResNets) - gernet_l/m/s (original versions called genet, but this was already used (by SENet author)).
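(The @register_model entrypoints above are normally reached through timm's create_model factory, imported in models/__init__.py earlier in this diff. A sketch, noting the empty url='' fields mean no pretrained weights were published for these experimental configs.)

```python
import torch

model = create_model('halonet26t', pretrained=False)  # resolved via .factory / .registry
logits = model(torch.randn(1, 3, 224, 224))           # -> shape (1, 1000)
```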
-Paper: `Neural Architecture Design for GPU-Efficient Networks` - https://arxiv.org/abs/2006.14090 -Code and weights: https://github.com/idstcv/GPU-Efficient-Networks, licensed Apache 2.0 - -RepVGG - repvgg_* -Paper: `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 -Code and weights: https://github.com/DingXiaoH/RepVGG, licensed MIT - -In all cases the models have been modified to fit within the design of ByobNet. I've remapped -the original weights and verified accuracies. - -For GPU Efficient nets, I used the original names for the blocks since they were for the most part -the same as original residual blocks in ResNe(X)t, DarkNet, and other existing models. Note also some -changes introduced in RegNet were also present in the stem and bottleneck blocks for this model. - -A significant number of different network archs can be implemented here, including variants of the -above nets that include attention. - -Hacked together by / copyright Ross Wightman, 2021. -""" -import math -from dataclasses import dataclass, field, replace -from collections import OrderedDict -from typing import Tuple, List, Optional, Union, Any, Callable, Sequence -from functools import partial - -import torch -import torch.nn as nn - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .helpers import build_model_with_cfg -from .layers import ClassifierHead, ConvBnAct, BatchNormAct2d, DropPath, AvgPool2dSame, \ - create_conv2d, get_act_layer, convert_norm_act, get_attn, make_divisible -from .registry import register_model - -__all__ = ['ByobNet', 'ByobCfg', 'BlocksCfg', 'create_byob_stem', 'create_block'] - - -def _cfg(url='', **kwargs): - return { - 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), - 'crop_pct': 0.875, 'interpolation': 'bilinear', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'stem.conv', 'classifier': 'head.fc', - **kwargs - } - - -default_cfgs = { - # GPU-Efficient (ResNet) weights - 'gernet_s': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-ger-weights/gernet_s-756b4751.pth'), - 'gernet_m': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-ger-weights/gernet_m-0873c53a.pth'), - 'gernet_l': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-ger-weights/gernet_l-f31e2e8d.pth', - input_size=(3, 256, 256), pool_size=(8, 8)), - - # RepVGG weights - 'repvgg_a2': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_a2-c1ee6d2b.pth', - first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')), - 'repvgg_b0': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b0-80ac3f1b.pth', - first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')), - 'repvgg_b1': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b1-77ca2989.pth', - first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')), - 'repvgg_b1g4': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b1g4-abde5d92.pth', - first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')), - 'repvgg_b2': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b2-25b7494e.pth', - first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')), - 'repvgg_b2g4': _cfg( - 
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b2g4-165a85f2.pth', - first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')), - 'repvgg_b3': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b3-199bc50d.pth', - first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')), - 'repvgg_b3g4': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b3g4-73c370bf.pth', - first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')), -} - - -@dataclass -class BlocksCfg: - type: Union[str, nn.Module] - d: int # block depth (number of block repeats in stage) - c: int # number of output channels for each block in stage - s: int = 2 # stride of stage (first block) - gs: Optional[Union[int, Callable]] = None # group-size of blocks in stage, conv is depthwise if gs == 1 - br: float = 1. # bottleneck-ratio of blocks in stage - no_attn: bool = True # disable channel attn (ie SE) when layer is set for model - - -@dataclass -class ByobCfg: - blocks: Tuple[Union[BlocksCfg, Tuple[BlocksCfg, ...]], ...] - downsample: str = 'conv1x1' - stem_type: str = '3x3' - stem_pool: str = '' - stem_chs: int = 32 - width_factor: float = 1.0 - num_features: int = 0 # num out_channels for final conv, no final 1x1 conv if 0 - zero_init_last_bn: bool = True - - act_layer: str = 'relu' - norm_layer: str = 'batchnorm' - attn_layer: Optional[str] = None - attn_kwargs: dict = field(default_factory=lambda: dict()) - - -def _rep_vgg_bcfg(d=(4, 6, 16, 1), wf=(1., 1., 1., 1.), groups=0): - c = (64, 128, 256, 512) - group_size = 0 - if groups > 0: - group_size = lambda chs, idx: chs // groups if (idx + 1) % 2 == 0 else 0 - bcfg = tuple([BlocksCfg(type='rep', d=d, c=c * wf, gs=group_size) for d, c, wf in zip(d, c, wf)]) - return bcfg - - -model_cfgs = dict( - - gernet_l=ByobCfg( - blocks=( - BlocksCfg(type='basic', d=1, c=128, s=2, gs=0, br=1.), - BlocksCfg(type='basic', d=2, c=192, s=2, gs=0, br=1.), - BlocksCfg(type='bottle', d=6, c=640, s=2, gs=0, br=1 / 4), - BlocksCfg(type='bottle', d=5, c=640, s=2, gs=1, br=3.), - BlocksCfg(type='bottle', d=4, c=640, s=1, gs=1, br=3.), - ), - stem_chs=32, - num_features=2560, - ), - gernet_m=ByobCfg( - blocks=( - BlocksCfg(type='basic', d=1, c=128, s=2, gs=0, br=1.), - BlocksCfg(type='basic', d=2, c=192, s=2, gs=0, br=1.), - BlocksCfg(type='bottle', d=6, c=640, s=2, gs=0, br=1 / 4), - BlocksCfg(type='bottle', d=4, c=640, s=2, gs=1, br=3.), - BlocksCfg(type='bottle', d=1, c=640, s=1, gs=1, br=3.), - ), - stem_chs=32, - num_features=2560, - ), - gernet_s=ByobCfg( - blocks=( - BlocksCfg(type='basic', d=1, c=48, s=2, gs=0, br=1.), - BlocksCfg(type='basic', d=3, c=48, s=2, gs=0, br=1.), - BlocksCfg(type='bottle', d=7, c=384, s=2, gs=0, br=1 / 4), - BlocksCfg(type='bottle', d=2, c=560, s=2, gs=1, br=3.), - BlocksCfg(type='bottle', d=1, c=256, s=1, gs=1, br=3.), - ), - stem_chs=13, - num_features=1920, - ), - - repvgg_a2=ByobCfg( - blocks=_rep_vgg_bcfg(d=(2, 4, 14, 1), wf=(1.5, 1.5, 1.5, 2.75)), - stem_type='rep', - stem_chs=64, - ), - repvgg_b0=ByobCfg( - blocks=_rep_vgg_bcfg(wf=(1., 1., 1., 2.5)), - stem_type='rep', - stem_chs=64, - ), - repvgg_b1=ByobCfg( - blocks=_rep_vgg_bcfg(wf=(2., 2., 2., 4.)), - stem_type='rep', - stem_chs=64, - ), - repvgg_b1g4=ByobCfg( - blocks=_rep_vgg_bcfg(wf=(2., 2., 2., 4.), groups=4), - stem_type='rep', - stem_chs=64, - ), - repvgg_b2=ByobCfg( - blocks=_rep_vgg_bcfg(wf=(2.5, 2.5, 2.5, 5.)), - stem_type='rep', - 
stem_chs=64, - ), - repvgg_b2g4=ByobCfg( - blocks=_rep_vgg_bcfg(wf=(2.5, 2.5, 2.5, 5.), groups=4), - stem_type='rep', - stem_chs=64, - ), - repvgg_b3=ByobCfg( - blocks=_rep_vgg_bcfg(wf=(3., 3., 3., 5.)), - stem_type='rep', - stem_chs=64, - ), - repvgg_b3g4=ByobCfg( - blocks=_rep_vgg_bcfg(wf=(3., 3., 3., 5.), groups=4), - stem_type='rep', - stem_chs=64, - ), - - resnet52q=ByobCfg( - blocks=( - BlocksCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25), - BlocksCfg(type='bottle', d=4, c=512, s=2, gs=32, br=0.25), - BlocksCfg(type='bottle', d=6, c=1536, s=2, gs=32, br=0.25), - BlocksCfg(type='bottle', d=4, c=1536, s=2, gs=1, br=1.0), - ), - stem_chs=128, - stem_type='quad', - num_features=2048, - act_layer='silu', - ), -) - - -def expand_blocks_cfg(stage_blocks_cfg: Union[BlocksCfg, Sequence[BlocksCfg]]) -> List[BlocksCfg]: - if not isinstance(stage_blocks_cfg, Sequence): - stage_blocks_cfg = (stage_blocks_cfg,) - block_cfgs = [] - for i, cfg in enumerate(stage_blocks_cfg): - block_cfgs += [replace(cfg, d=1) for _ in range(cfg.d)] - return block_cfgs - - -def num_groups(group_size, channels): - if not group_size: # 0 or None - return 1 # normal conv with 1 group - else: - # NOTE group_size == 1 -> depthwise conv - assert channels % group_size == 0 - return channels // group_size - - -@dataclass -class LayerFn: - conv_norm_act: Callable = ConvBnAct - norm_act: Callable = BatchNormAct2d - act: Callable = nn.ReLU - attn: Optional[Callable] = None - - -class DownsampleAvg(nn.Module): - def __init__(self, in_chs, out_chs, stride=1, dilation=1, apply_act=False, layers: LayerFn = None): - """ AvgPool Downsampling as in 'D' ResNet variants.""" - super(DownsampleAvg, self).__init__() - layers = layers or LayerFn() - avg_stride = stride if dilation == 1 else 1 - if stride > 1 or dilation > 1: - avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d - self.pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False) - else: - self.pool = nn.Identity() - self.conv = layers.conv_norm_act(in_chs, out_chs, 1, apply_act=apply_act) - - def forward(self, x): - return self.conv(self.pool(x)) - - -def create_downsample(downsample_type, layers: LayerFn, **kwargs): - if downsample_type == 'avg': - return DownsampleAvg(**kwargs) - else: - return layers.conv_norm_act(kwargs.pop('in_chs'), kwargs.pop('out_chs'), kernel_size=1, **kwargs) - - -class BasicBlock(nn.Module): - """ ResNet Basic Block - kxk + kxk - """ - - def __init__( - self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), group_size=None, bottle_ratio=1.0, - downsample='avg', linear_out=False, layers: LayerFn = None, drop_block=None, drop_path_rate=0.): - super(BasicBlock, self).__init__() - layers = layers or LayerFn() - mid_chs = make_divisible(out_chs * bottle_ratio) - groups = num_groups(group_size, mid_chs) - - if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]: - self.shortcut = create_downsample( - downsample, in_chs=in_chs, out_chs=out_chs, stride=stride, dilation=dilation[0], - apply_act=False, layers=layers) - else: - self.shortcut = nn.Identity() - - self.conv1_kxk = layers.conv_norm_act(in_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0]) - self.conv2_kxk = layers.conv_norm_act( - mid_chs, out_chs, kernel_size, dilation=dilation[1], groups=groups, drop_block=drop_block, apply_act=False) - self.attn = nn.Identity() if layers.attn is None else layers.attn(out_chs) - self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. 
else nn.Identity() - self.act = nn.Identity() if linear_out else layers.act(inplace=True) - - def init_weights(self, zero_init_last_bn=False): - if zero_init_last_bn: - nn.init.zeros_(self.conv2_kxk.bn.weight) - - def forward(self, x): - shortcut = self.shortcut(x) - - # residual path - x = self.conv1_kxk(x) - x = self.conv2_kxk(x) - x = self.attn(x) - x = self.drop_path(x) - - x = self.act(x + shortcut) - return x - - -class BottleneckBlock(nn.Module): - """ ResNet-like Bottleneck Block - 1x1 - kxk - 1x1 - """ - - def __init__(self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1., group_size=None, - downsample='avg', linear_out=False, layers : LayerFn = None, drop_block=None, drop_path_rate=0.): - super(BottleneckBlock, self).__init__() - layers = layers or LayerFn() - mid_chs = make_divisible(out_chs * bottle_ratio) - groups = num_groups(group_size, mid_chs) - - if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]: - self.shortcut = create_downsample( - downsample, in_chs=in_chs, out_chs=out_chs, stride=stride, dilation=dilation[0], - apply_act=False, layers=layers) - else: - self.shortcut = nn.Identity() - - self.conv1_1x1 = layers.conv_norm_act(in_chs, mid_chs, 1) - self.conv2_kxk = layers.conv_norm_act( - mid_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0], - groups=groups, drop_block=drop_block) - self.attn = nn.Identity() if layers.attn is None else layers.attn(mid_chs) - self.conv3_1x1 = layers.conv_norm_act(mid_chs, out_chs, 1, apply_act=False) - self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity() - self.act = nn.Identity() if linear_out else layers.act(inplace=True) - - def init_weights(self, zero_init_last_bn=False): - if zero_init_last_bn: - nn.init.zeros_(self.conv3_1x1.bn.weight) - - def forward(self, x): - shortcut = self.shortcut(x) - - x = self.conv1_1x1(x) - x = self.conv2_kxk(x) - x = self.attn(x) - x = self.conv3_1x1(x) - x = self.drop_path(x) - - x = self.act(x + shortcut) - return x - - -class DarkBlock(nn.Module): - """ DarkNet-like (1x1 + 3x3 w/ stride) block - - The GE-Net impl included a 1x1 + 3x3 block in their search space. It was not used in the feature models. - This block is pretty much a DarkNet block (also DenseNet) hence the name. Neither DarkNet nor DenseNet - uses strides within the block (external 3x3 or maxpool downsampling is done in front of the block repeats). - - If one does want to use a lot of these blocks w/ stride, I'd recommend using the EdgeBlock (3x3 w/ stride + 1x1) - for more optimal compute.
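(These blocks are self-contained nn.Modules; with the default LayerFn (ConvBnAct + BatchNormAct2d + ReLU), the BottleneckBlock above can be exercised standalone. A sketch of a stride-2 stage transition with illustrative shapes:)

```python
import torch

block = BottleneckBlock(in_chs=64, out_chs=256, stride=2, bottle_ratio=0.25)
y = block(torch.randn(2, 64, 56, 56))  # -> (2, 256, 28, 28); avg-pool shortcut handles the stride
```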
- """ - - def __init__(self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1.0, group_size=None, - downsample='avg', linear_out=False, layers: LayerFn = None, drop_block=None, drop_path_rate=0.): - super(DarkBlock, self).__init__() - layers = layers or LayerFn() - mid_chs = make_divisible(out_chs * bottle_ratio) - groups = num_groups(group_size, mid_chs) - - if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]: - self.shortcut = create_downsample( - downsample, in_chs=in_chs, out_chs=out_chs, stride=stride, dilation=dilation[0], - apply_act=False, layers=layers) - else: - self.shortcut = nn.Identity() - - self.conv1_1x1 = layers.conv_norm_act(in_chs, mid_chs, 1) - self.conv2_kxk = layers.conv_norm_act( - mid_chs, out_chs, kernel_size, stride=stride, dilation=dilation[0], - groups=groups, drop_block=drop_block, apply_act=False) - self.attn = nn.Identity() if layers.attn is None else layers.attn(out_chs) - self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity() - self.act = nn.Identity() if linear_out else layers.act(inplace=True) - - def init_weights(self, zero_init_last_bn=False): - if zero_init_last_bn: - nn.init.zeros_(self.conv2_kxk.bn.weight) - - def forward(self, x): - shortcut = self.shortcut(x) - - x = self.conv1_1x1(x) - x = self.conv2_kxk(x) - x = self.attn(x) - x = self.drop_path(x) - x = self.act(x + shortcut) - return x - - -class EdgeBlock(nn.Module): - """ EdgeResidual-like (3x3 + 1x1) block - - A two layer block like DarkBlock, but with the order of the 3x3 and 1x1 convs reversed. - Very similar to the EfficientNet Edge-Residual block but this block it ends with activations, is - intended to be used with either expansion or bottleneck contraction, and can use DW/group/non-grouped convs. - - FIXME is there a more common 3x3 + 1x1 conv block to name this after? - """ - - def __init__(self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1.0, group_size=None, - downsample='avg', linear_out=False, layers: LayerFn = None, drop_block=None, drop_path_rate=0.): - super(EdgeBlock, self).__init__() - layers = layers or LayerFn() - mid_chs = make_divisible(out_chs * bottle_ratio) - groups = num_groups(group_size, mid_chs) - - if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]: - self.shortcut = create_downsample( - downsample, in_chs=in_chs, out_chs=out_chs, stride=stride, dilation=dilation[0], - apply_act=False, layers=layers) - else: - self.shortcut = nn.Identity() - - self.conv1_kxk = layers.conv_norm_act( - in_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0], - groups=groups, drop_block=drop_block) - self.attn = nn.Identity() if layers.attn is None else layers.attn(out_chs) - self.conv2_1x1 = layers.conv_norm_act(mid_chs, out_chs, 1, apply_act=False) - self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity() - self.act = nn.Identity() if linear_out else layers.act(inplace=True) - - def init_weights(self, zero_init_last_bn=False): - if zero_init_last_bn: - nn.init.zeros_(self.conv2_1x1.bn.weight) - - def forward(self, x): - shortcut = self.shortcut(x) - - x = self.conv1_kxk(x) - x = self.attn(x) - x = self.conv2_1x1(x) - x = self.drop_path(x) - x = self.act(x + shortcut) - return x - - -class RepVggBlock(nn.Module): - """ RepVGG Block. - - Adapted from impl at https://github.com/DingXiaoH/RepVGG - - This version does not currently support the deploy optimization. It is currently fixed in 'train' mode. 
- """ - - def __init__(self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1.0, group_size=None, - downsample='', layers : LayerFn = None, drop_block=None, drop_path_rate=0.): - super(RepVggBlock, self).__init__() - layers = layers or LayerFn() - groups = num_groups(group_size, in_chs) - - use_ident = in_chs == out_chs and stride == 1 and dilation[0] == dilation[1] - self.identity = layers.norm_act(out_chs, apply_act=False) if use_ident else None - self.conv_kxk = layers.conv_norm_act( - in_chs, out_chs, kernel_size, stride=stride, dilation=dilation[0], - groups=groups, drop_block=drop_block, apply_act=False) - self.conv_1x1 = layers.conv_norm_act(in_chs, out_chs, 1, stride=stride, groups=groups, apply_act=False) - self.attn = nn.Identity() if layers.attn is None else layers.attn(out_chs) - self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. and use_ident else nn.Identity() - self.act = layers.act(inplace=True) - - def init_weights(self, zero_init_last_bn=False): - # NOTE this init overrides that base model init with specific changes for the block type - for m in self.modules(): - if isinstance(m, nn.BatchNorm2d): - nn.init.normal_(m.weight, .1, .1) - nn.init.normal_(m.bias, 0, .1) - - def forward(self, x): - if self.identity is None: - x = self.conv_1x1(x) + self.conv_kxk(x) - else: - identity = self.identity(x) - x = self.conv_1x1(x) + self.conv_kxk(x) - x = self.drop_path(x) # not in the paper / official impl, experimental - x = x + identity - x = self.attn(x) # no attn in the paper / official impl, experimental - x = self.act(x) - return x - - -_block_registry = dict( - basic=BasicBlock, - bottle=BottleneckBlock, - dark=DarkBlock, - edge=EdgeBlock, - rep=RepVggBlock, -) - - -def register_block(block_type:str, block_fn: nn.Module): - _block_registry[block_type] = block_fn - - -def create_block(block: Union[str, nn.Module], **kwargs): - if isinstance(block, (nn.Module, partial)): - return block(**kwargs) - assert block in _block_registry, f'Unknown block type ({block}' - return _block_registry[block](**kwargs) - - -class Stem(nn.Sequential): - - def __init__(self, in_chs, out_chs, kernel_size=3, stride=4, pool='maxpool', - num_rep=3, num_act=None, chs_decay=0.5, layers: LayerFn = None): - super().__init__() - assert stride in (2, 4) - layers = layers or LayerFn() - - if isinstance(out_chs, (list, tuple)): - num_rep = len(out_chs) - stem_chs = out_chs - else: - stem_chs = [round(out_chs * chs_decay ** i) for i in range(num_rep)][::-1] - - self.stride = stride - self.feature_info = [] # track intermediate features - prev_feat = '' - stem_strides = [2] + [1] * (num_rep - 1) - if stride == 4 and not pool: - # set last conv in stack to be strided if stride == 4 and no pooling layer - stem_strides[-1] = 2 - - num_act = num_rep if num_act is None else num_act - # if num_act < num_rep, first convs in stack won't have bn + act - stem_norm_acts = [False] * (num_rep - num_act) + [True] * num_act - prev_chs = in_chs - curr_stride = 1 - for i, (ch, s, na) in enumerate(zip(stem_chs, stem_strides, stem_norm_acts)): - layer_fn = layers.conv_norm_act if na else create_conv2d - conv_name = f'conv{i + 1}' - if i > 0 and s > 1: - self.feature_info.append(dict(num_chs=prev_chs, reduction=curr_stride, module=prev_feat)) - self.add_module(conv_name, layer_fn(prev_chs, ch, kernel_size=kernel_size, stride=s)) - prev_chs = ch - curr_stride *= s - prev_feat = conv_name - - if 'max' in pool.lower(): - self.feature_info.append(dict(num_chs=prev_chs, reduction=curr_stride, 
module=prev_feat)) - self.add_module('pool', nn.MaxPool2d(3, 2, 1)) - curr_stride *= 2 - prev_feat = 'pool' - - self.feature_info.append(dict(num_chs=prev_chs, reduction=curr_stride, module=prev_feat)) - assert curr_stride == stride - - -def create_byob_stem(in_chs, out_chs, stem_type='', pool_type='', feat_prefix='stem', layers: LayerFn = None): - layers = layers or LayerFn() - assert stem_type in ('', 'quad', 'tiered', 'deep', 'rep', '7x7', '3x3') - if 'quad' in stem_type: - # based on NFNet stem, stack of 4 3x3 convs - num_act = 2 if 'quad2' in stem_type else None - stem = Stem(in_chs, out_chs, num_rep=4, num_act=num_act, pool=pool_type, layers=layers) - elif 'tiered' in stem_type: - # 3x3 stack of 3 convs as in my ResNet-T - stem = Stem(in_chs, (3 * out_chs // 8, out_chs // 2, out_chs), pool=pool_type, layers=layers) - elif 'deep' in stem_type: - # 3x3 stack of 3 convs as in ResNet-D - stem = Stem(in_chs, out_chs, num_rep=3, chs_decay=1.0, pool=pool_type, layers=layers) - elif 'rep' in stem_type: - stem = RepVggBlock(in_chs, out_chs, stride=2, layers=layers) - elif '7x7' in stem_type: - # 7x7 stem conv as in ResNet - if pool_type: - stem = Stem(in_chs, out_chs, 7, num_rep=1, pool=pool_type, layers=layers) - else: - stem = layers.conv_norm_act(in_chs, out_chs, 7, stride=2) - else: - # 3x3 stem conv as in RegNet is the default - if pool_type: - stem = Stem(in_chs, out_chs, 3, num_rep=1, pool=pool_type, layers=layers) - else: - stem = layers.conv_norm_act(in_chs, out_chs, 3, stride=2) - - if isinstance(stem, Stem): - feature_info = [dict(f, module='.'.join([feat_prefix, f['module']])) for f in stem.feature_info] - else: - feature_info = [dict(num_chs=out_chs, reduction=2, module=feat_prefix)] - return stem, feature_info - - -def reduce_feat_size(feat_size, stride=2): - return None if feat_size is None else tuple([s // stride for s in feat_size]) - - -def create_byob_stages( - cfg, drop_path_rate, output_stride, stem_feat, - feat_size=None, layers=None, extra_args_fn=None): - layers = layers or LayerFn() - feature_info = [] - block_cfgs = [expand_blocks_cfg(s) for s in cfg.blocks] - depths = [sum([bc.d for bc in stage_bcs]) for stage_bcs in block_cfgs] - dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)] - dilation = 1 - net_stride = stem_feat['reduction'] - prev_chs = stem_feat['num_chs'] - prev_feat = stem_feat - stages = [] - for stage_idx, stage_block_cfgs in enumerate(block_cfgs): - stride = stage_block_cfgs[0].s - if stride != 1 and prev_feat: - feature_info.append(prev_feat) - if net_stride >= output_stride and stride > 1: - dilation *= stride - stride = 1 - net_stride *= stride - first_dilation = 1 if dilation in (1, 2) else 2 - - blocks = [] - for block_idx, block_cfg in enumerate(stage_block_cfgs): - out_chs = make_divisible(block_cfg.c * cfg.width_factor) - group_size = block_cfg.gs - if isinstance(group_size, Callable): - group_size = group_size(out_chs, block_idx) - block_kwargs = dict( # Blocks used in this model must accept these arguments - in_chs=prev_chs, - out_chs=out_chs, - stride=stride if block_idx == 0 else 1, - dilation=(first_dilation, dilation), - group_size=group_size, - bottle_ratio=block_cfg.br, - downsample=cfg.downsample, - drop_path_rate=dpr[stage_idx][block_idx], - layers=layers, - ) - if extra_args_fn is not None: - extra_args_fn(block_kwargs, block_cfg=block_cfg, model_cfg=cfg, feat_size=feat_size) - blocks += [create_block(block_cfg.type, **block_kwargs)] - first_dilation = dilation - prev_chs = out_chs - if stride > 
1 and block_idx == 0: - feat_size = reduce_feat_size(feat_size, stride) - - stages += [nn.Sequential(*blocks)] - prev_feat = dict(num_chs=prev_chs, reduction=net_stride, module=f'stages.{stage_idx}') - - feature_info.append(prev_feat) - return nn.Sequential(*stages), feature_info - - -def get_layer_fns(cfg: ByobCfg): - act = get_act_layer(cfg.act_layer) - norm_act = convert_norm_act(norm_layer=cfg.norm_layer, act_layer=act) - conv_norm_act = partial(ConvBnAct, norm_layer=cfg.norm_layer, act_layer=act) - attn = partial(get_attn(cfg.attn_layer), **cfg.attn_kwargs) if cfg.attn_layer else None - layer_fn = LayerFn(conv_norm_act=conv_norm_act, norm_act=norm_act, act=act, attn=attn) - return layer_fn - - -class ByobNet(nn.Module): - """ 'Bring-your-own-blocks' Net - - A flexible network backbone that allows building model stem + blocks via - dataclass cfg definition w/ factory functions for module instantiation. - - Current assumption is that both stem and blocks are in conv-bn-act order (w/ block ending in act). - """ - def __init__(self, cfg: ByobCfg, num_classes=1000, in_chans=3, global_pool='avg', output_stride=32, - zero_init_last_bn=True, drop_rate=0., drop_path_rate=0.): - super().__init__() - self.num_classes = num_classes - self.drop_rate = drop_rate - layers = get_layer_fns(cfg) - - self.feature_info = [] - stem_chs = int(round((cfg.stem_chs or cfg.blocks[0].c) * cfg.width_factor)) - self.stem, stem_feat = create_byob_stem(in_chans, stem_chs, cfg.stem_type, cfg.stem_pool, layers=layers) - self.feature_info.extend(stem_feat[:-1]) - - self.stages, stage_feat = create_byob_stages(cfg, drop_path_rate, output_stride, stem_feat[-1], layers=layers) - self.feature_info.extend(stage_feat[:-1]) - - prev_chs = stage_feat[-1]['num_chs'] - if cfg.num_features: - self.num_features = int(round(cfg.width_factor * cfg.num_features)) - self.final_conv = layers.conv_norm_act(prev_chs, self.num_features, 1) - else: - self.num_features = prev_chs - self.final_conv = nn.Identity() - self.feature_info += [ - dict(num_chs=self.num_features, reduction=stage_feat[-1]['reduction'], module='final_conv')] - - self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) - - for n, m in self.named_modules(): - _init_weights(m, n) - for m in self.modules(): - # call each block's weight init for block-specific overrides to init above - if hasattr(m, 'init_weights'): - m.init_weights(zero_init_last_bn=zero_init_last_bn) - - def get_classifier(self): - return self.head.fc - - def reset_classifier(self, num_classes, global_pool='avg'): - self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) - - def forward_features(self, x): - x = self.stem(x) - x = self.stages(x) - x = self.final_conv(x) - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.head(x) - return x - - -def _init_weights(m, n=''): - if isinstance(m, nn.Conv2d): - fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - fan_out //= m.groups - m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) - if m.bias is not None: - m.bias.data.zero_() - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, mean=0.0, std=0.01) - if m.bias is not None: - nn.init.zeros_(m.bias) - elif isinstance(m, nn.BatchNorm2d): - nn.init.ones_(m.weight) - nn.init.zeros_(m.bias) - - -def _create_byobnet(variant, pretrained=False, **kwargs): - return build_model_with_cfg( - ByobNet, variant, pretrained, - default_cfg=default_cfgs[variant], - 
        model_cfg=model_cfgs[variant],
-        feature_cfg=dict(flatten_sequential=True),
-        **kwargs)
-
-
-@register_model
-def gernet_l(pretrained=False, **kwargs):
-    """ GEResNet-Large (GENet-Large from official impl)
-    `Neural Architecture Design for GPU-Efficient Networks` - https://arxiv.org/abs/2006.14090
-    """
-    return _create_byobnet('gernet_l', pretrained=pretrained, **kwargs)
-
-
-@register_model
-def gernet_m(pretrained=False, **kwargs):
-    """ GEResNet-Medium (GENet-Normal from official impl)
-    `Neural Architecture Design for GPU-Efficient Networks` - https://arxiv.org/abs/2006.14090
-    """
-    return _create_byobnet('gernet_m', pretrained=pretrained, **kwargs)
-
-
-@register_model
-def gernet_s(pretrained=False, **kwargs):
-    """ GEResNet-Small (GENet-Small from official impl)
-    `Neural Architecture Design for GPU-Efficient Networks` - https://arxiv.org/abs/2006.14090
-    """
-    return _create_byobnet('gernet_s', pretrained=pretrained, **kwargs)
-
-
-@register_model
-def repvgg_a2(pretrained=False, **kwargs):
-    """ RepVGG-A2
-    `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697
-    """
-    return _create_byobnet('repvgg_a2', pretrained=pretrained, **kwargs)
-
-
-@register_model
-def repvgg_b0(pretrained=False, **kwargs):
-    """ RepVGG-B0
-    `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697
-    """
-    return _create_byobnet('repvgg_b0', pretrained=pretrained, **kwargs)
-
-
-@register_model
-def repvgg_b1(pretrained=False, **kwargs):
-    """ RepVGG-B1
-    `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697
-    """
-    return _create_byobnet('repvgg_b1', pretrained=pretrained, **kwargs)
-
-
-@register_model
-def repvgg_b1g4(pretrained=False, **kwargs):
-    """ RepVGG-B1g4
-    `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697
-    """
-    return _create_byobnet('repvgg_b1g4', pretrained=pretrained, **kwargs)
-
-
-@register_model
-def repvgg_b2(pretrained=False, **kwargs):
-    """ RepVGG-B2
-    `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697
-    """
-    return _create_byobnet('repvgg_b2', pretrained=pretrained, **kwargs)
-
-
-@register_model
-def repvgg_b2g4(pretrained=False, **kwargs):
-    """ RepVGG-B2g4
-    `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697
-    """
-    return _create_byobnet('repvgg_b2g4', pretrained=pretrained, **kwargs)
-
-
-@register_model
-def repvgg_b3(pretrained=False, **kwargs):
-    """ RepVGG-B3
-    `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697
-    """
-    return _create_byobnet('repvgg_b3', pretrained=pretrained, **kwargs)
-
-
-@register_model
-def repvgg_b3g4(pretrained=False, **kwargs):
-    """ RepVGG-B3g4
-    `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697
-    """
-    return _create_byobnet('repvgg_b3g4', pretrained=pretrained, **kwargs)
diff --git a/AVLFormer/src/timm/models/cspnet.py b/AVLFormer/src/timm/models/cspnet.py
deleted file mode 100644
index 87dd706..0000000
--- a/AVLFormer/src/timm/models/cspnet.py
+++ /dev/null
@@ -1,457 +0,0 @@
-"""PyTorch CspNet
-
-A PyTorch implementation of Cross Stage Partial Networks including:
-* CSPResNet50
-* CSPResNeXt50
-* CSPDarkNet53
-* and DarkNet53 for good measure
-
-Based on paper `CSPNet: A New Backbone that can Enhance Learning Capability of CNN` - https://arxiv.org/abs/1911.11929
-
-Reference impl via darknet cfg files at https://github.com/WongKinYiu/CrossStagePartialNetworks
-
-Hacked together by / Copyright 2020 Ross Wightman
-"""
-import torch
-import torch.nn as nn
-import
torch.nn.functional as F - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .helpers import build_model_with_cfg -from .layers import ClassifierHead, ConvBnAct, DropPath, create_attn, get_norm_act_layer -from .registry import register_model - - -__all__ = ['CspNet'] # model_registry will add each entrypoint fn to this - - -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 256, 256), 'pool_size': (8, 8), - 'crop_pct': 0.887, 'interpolation': 'bilinear', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'stem.conv1.conv', 'classifier': 'head.fc', - **kwargs - } - - -default_cfgs = { - 'cspresnet50': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/cspresnet50_ra-d3e8d487.pth'), - 'cspresnet50d': _cfg(url=''), - 'cspresnet50w': _cfg(url=''), - 'cspresnext50': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/cspresnext50_ra_224-648b4713.pth', - input_size=(3, 224, 224), pool_size=(7, 7), crop_pct=0.875 # FIXME I trained this at 224x224, not 256 like ref impl - ), - 'cspresnext50_iabn': _cfg(url=''), - 'cspdarknet53': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/cspdarknet53_ra_256-d05c7c21.pth'), - 'cspdarknet53_iabn': _cfg(url=''), - 'darknet53': _cfg(url=''), -} - - -model_cfgs = dict( - cspresnet50=dict( - stem=dict(out_chs=64, kernel_size=7, stride=2, pool='max'), - stage=dict( - out_chs=(128, 256, 512, 1024), - depth=(3, 3, 5, 2), - stride=(1,) + (2,) * 3, - exp_ratio=(2.,) * 4, - bottle_ratio=(0.5,) * 4, - block_ratio=(1.,) * 4, - cross_linear=True, - ) - ), - cspresnet50d=dict( - stem=dict(out_chs=[32, 32, 64], kernel_size=3, stride=2, pool='max'), - stage=dict( - out_chs=(128, 256, 512, 1024), - depth=(3, 3, 5, 2), - stride=(1,) + (2,) * 3, - exp_ratio=(2.,) * 4, - bottle_ratio=(0.5,) * 4, - block_ratio=(1.,) * 4, - cross_linear=True, - ) - ), - cspresnet50w=dict( - stem=dict(out_chs=[32, 32, 64], kernel_size=3, stride=2, pool='max'), - stage=dict( - out_chs=(256, 512, 1024, 2048), - depth=(3, 3, 5, 2), - stride=(1,) + (2,) * 3, - exp_ratio=(1.,) * 4, - bottle_ratio=(0.25,) * 4, - block_ratio=(0.5,) * 4, - cross_linear=True, - ) - ), - cspresnext50=dict( - stem=dict(out_chs=64, kernel_size=7, stride=2, pool='max'), - stage=dict( - out_chs=(256, 512, 1024, 2048), - depth=(3, 3, 5, 2), - stride=(1,) + (2,) * 3, - groups=(32,) * 4, - exp_ratio=(1.,) * 4, - bottle_ratio=(1.,) * 4, - block_ratio=(0.5,) * 4, - cross_linear=True, - ) - ), - cspdarknet53=dict( - stem=dict(out_chs=32, kernel_size=3, stride=1, pool=''), - stage=dict( - out_chs=(64, 128, 256, 512, 1024), - depth=(1, 2, 8, 8, 4), - stride=(2,) * 5, - exp_ratio=(2.,) + (1.,) * 4, - bottle_ratio=(0.5,) + (1.0,) * 4, - block_ratio=(1.,) + (0.5,) * 4, - down_growth=True, - ) - ), - darknet53=dict( - stem=dict(out_chs=32, kernel_size=3, stride=1, pool=''), - stage=dict( - out_chs=(64, 128, 256, 512, 1024), - depth=(1, 2, 8, 8, 4), - stride=(2,) * 5, - bottle_ratio=(0.5,) * 5, - block_ratio=(1.,) * 5, - ) - ) -) - - -def create_stem( - in_chans=3, out_chs=32, kernel_size=3, stride=2, pool='', - act_layer=None, norm_layer=None, aa_layer=None): - stem = nn.Sequential() - if not isinstance(out_chs, (tuple, list)): - out_chs = [out_chs] - assert len(out_chs) - in_c = in_chans - for i, out_c in enumerate(out_chs): - conv_name = f'conv{i + 1}' - stem.add_module(conv_name, ConvBnAct( - in_c, out_c, 
kernel_size, stride=stride if i == 0 else 1, - act_layer=act_layer, norm_layer=norm_layer)) - in_c = out_c - last_conv = conv_name - if pool: - if aa_layer is not None: - stem.add_module('pool', nn.MaxPool2d(kernel_size=3, stride=1, padding=1)) - stem.add_module('aa', aa_layer(channels=in_c, stride=2)) - else: - stem.add_module('pool', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)) - return stem, dict(num_chs=in_c, reduction=stride, module='.'.join(['stem', last_conv])) - - -class ResBottleneck(nn.Module): - """ ResNe(X)t Bottleneck Block - """ - - def __init__(self, in_chs, out_chs, dilation=1, bottle_ratio=0.25, groups=1, - act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, attn_last=False, - attn_layer=None, aa_layer=None, drop_block=None, drop_path=None): - super(ResBottleneck, self).__init__() - mid_chs = int(round(out_chs * bottle_ratio)) - ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer, drop_block=drop_block) - - self.conv1 = ConvBnAct(in_chs, mid_chs, kernel_size=1, **ckwargs) - self.conv2 = ConvBnAct(mid_chs, mid_chs, kernel_size=3, dilation=dilation, groups=groups, **ckwargs) - self.attn2 = create_attn(attn_layer, channels=mid_chs) if not attn_last else None - self.conv3 = ConvBnAct(mid_chs, out_chs, kernel_size=1, apply_act=False, **ckwargs) - self.attn3 = create_attn(attn_layer, channels=out_chs) if attn_last else None - self.drop_path = drop_path - self.act3 = act_layer(inplace=True) - - def zero_init_last_bn(self): - nn.init.zeros_(self.conv3.bn.weight) - - def forward(self, x): - shortcut = x - x = self.conv1(x) - x = self.conv2(x) - if self.attn2 is not None: - x = self.attn2(x) - x = self.conv3(x) - if self.attn3 is not None: - x = self.attn3(x) - if self.drop_path is not None: - x = self.drop_path(x) - x = x + shortcut - # FIXME partial shortcut needed if first block handled as per original, not used for my current impl - #x[:, :shortcut.size(1)] += shortcut - x = self.act3(x) - return x - - -class DarkBlock(nn.Module): - """ DarkNet Block - """ - - def __init__(self, in_chs, out_chs, dilation=1, bottle_ratio=0.5, groups=1, - act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, attn_layer=None, aa_layer=None, - drop_block=None, drop_path=None): - super(DarkBlock, self).__init__() - mid_chs = int(round(out_chs * bottle_ratio)) - ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer, drop_block=drop_block) - self.conv1 = ConvBnAct(in_chs, mid_chs, kernel_size=1, **ckwargs) - self.conv2 = ConvBnAct(mid_chs, out_chs, kernel_size=3, dilation=dilation, groups=groups, **ckwargs) - self.attn = create_attn(attn_layer, channels=out_chs) - self.drop_path = drop_path - - def zero_init_last_bn(self): - nn.init.zeros_(self.conv2.bn.weight) - - def forward(self, x): - shortcut = x - x = self.conv1(x) - x = self.conv2(x) - if self.attn is not None: - x = self.attn(x) - if self.drop_path is not None: - x = self.drop_path(x) - x = x + shortcut - return x - - -class CrossStage(nn.Module): - """Cross Stage.""" - def __init__(self, in_chs, out_chs, stride, dilation, depth, block_ratio=1., bottle_ratio=1., exp_ratio=1., - groups=1, first_dilation=None, down_growth=False, cross_linear=False, block_dpr=None, - block_fn=ResBottleneck, **block_kwargs): - super(CrossStage, self).__init__() - first_dilation = first_dilation or dilation - down_chs = out_chs if down_growth else in_chs # grow downsample channels to output channels - exp_chs = int(round(out_chs * exp_ratio)) - block_out_chs = int(round(out_chs * block_ratio)) - conv_kwargs = 
dict(act_layer=block_kwargs.get('act_layer'), norm_layer=block_kwargs.get('norm_layer'))
-
-        if stride != 1 or first_dilation != dilation:
-            self.conv_down = ConvBnAct(
-                in_chs, down_chs, kernel_size=3, stride=stride, dilation=first_dilation, groups=groups,
-                aa_layer=block_kwargs.get('aa_layer', None), **conv_kwargs)
-            prev_chs = down_chs
-        else:
-            self.conv_down = None
-            prev_chs = in_chs
-
-        # FIXME this 1x1 expansion is pushed down into the cross and block paths in the darknet cfgs. Also,
-        # there is a special case for the first stage of some models that results in an uneven split
-        # across the two paths. I did it this way for simplicity for now.
-        self.conv_exp = ConvBnAct(prev_chs, exp_chs, kernel_size=1, apply_act=not cross_linear, **conv_kwargs)
-        prev_chs = exp_chs // 2  # output of conv_exp is always split in two
-
-        self.blocks = nn.Sequential()
-        for i in range(depth):
-            drop_path = DropPath(block_dpr[i]) if block_dpr and block_dpr[i] else None
-            self.blocks.add_module(str(i), block_fn(
-                prev_chs, block_out_chs, dilation, bottle_ratio, groups, drop_path=drop_path, **block_kwargs))
-            prev_chs = block_out_chs
-
-        # transition convs
-        self.conv_transition_b = ConvBnAct(prev_chs, exp_chs // 2, kernel_size=1, **conv_kwargs)
-        self.conv_transition = ConvBnAct(exp_chs, out_chs, kernel_size=1, **conv_kwargs)
-
-    def forward(self, x):
-        if self.conv_down is not None:
-            x = self.conv_down(x)
-        x = self.conv_exp(x)
-        split = x.shape[1] // 2
-        xs, xb = x[:, :split], x[:, split:]
-        xb = self.blocks(xb)
-        xb = self.conv_transition_b(xb).contiguous()
-        out = self.conv_transition(torch.cat([xs, xb], dim=1))
-        return out
-
-
-class DarkStage(nn.Module):
-    """DarkNet stage."""
-
-    def __init__(self, in_chs, out_chs, stride, dilation, depth, block_ratio=1., bottle_ratio=1., groups=1,
-                 first_dilation=None, block_fn=ResBottleneck, block_dpr=None, **block_kwargs):
-        super(DarkStage, self).__init__()
-        first_dilation = first_dilation or dilation
-
-        self.conv_down = ConvBnAct(
-            in_chs, out_chs, kernel_size=3, stride=stride, dilation=first_dilation, groups=groups,
-            act_layer=block_kwargs.get('act_layer'), norm_layer=block_kwargs.get('norm_layer'),
-            aa_layer=block_kwargs.get('aa_layer', None))
-
-        prev_chs = out_chs
-        block_out_chs = int(round(out_chs * block_ratio))
-        self.blocks = nn.Sequential()
-        for i in range(depth):
-            drop_path = DropPath(block_dpr[i]) if block_dpr and block_dpr[i] else None
-            self.blocks.add_module(str(i), block_fn(
-                prev_chs, block_out_chs, dilation, bottle_ratio, groups, drop_path=drop_path, **block_kwargs))
-            prev_chs = block_out_chs
-
-    def forward(self, x):
-        x = self.conv_down(x)
-        x = self.blocks(x)
-        return x
-
-
-def _cfg_to_stage_args(cfg, curr_stride=2, output_stride=32, drop_path_rate=0.):
-    # get per stage args for stage and containing blocks, calculate strides to meet target output_stride
-    num_stages = len(cfg['depth'])
-    if 'groups' not in cfg:
-        cfg['groups'] = (1,) * num_stages
-    if 'down_growth' in cfg and not isinstance(cfg['down_growth'], (list, tuple)):
-        cfg['down_growth'] = (cfg['down_growth'],) * num_stages
-    if 'cross_linear' in cfg and not isinstance(cfg['cross_linear'], (list, tuple)):
-        cfg['cross_linear'] = (cfg['cross_linear'],) * num_stages
-    cfg['block_dpr'] = [None] * num_stages if not drop_path_rate else \
-        [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(cfg['depth'])).split(cfg['depth'])]
-    stage_strides = []
-    stage_dilations = []
-    stage_first_dilations = []
-    dilation = 1
-    for cfg_stride in cfg['stride']:
-        stage_first_dilations.append(dilation)
-        if curr_stride >= output_stride:
-            dilation *= cfg_stride
-            stride = 1
-        else:
-            stride = cfg_stride
-            curr_stride *= stride
-        stage_strides.append(stride)
-        stage_dilations.append(dilation)
-    cfg['stride'] = stage_strides
-    cfg['dilation'] = stage_dilations
-    cfg['first_dilation'] = stage_first_dilations
-    stage_args = [dict(zip(cfg.keys(), values)) for values in zip(*cfg.values())]
-    return stage_args
-
-
-class CspNet(nn.Module):
-    """Cross Stage Partial base model.
-
-    Paper: `CSPNet: A New Backbone that can Enhance Learning Capability of CNN` - https://arxiv.org/abs/1911.11929
-    Ref Impl: https://github.com/WongKinYiu/CrossStagePartialNetworks
-
-    NOTE: There are differences in the way I handle the 1x1 'expansion' conv in this impl vs the
-    darknet impl. I did it this way for simplicity and fewer special cases.
-    """
-
-    def __init__(self, cfg, in_chans=3, num_classes=1000, output_stride=32, global_pool='avg', drop_rate=0.,
-                 act_layer=nn.LeakyReLU, norm_layer=nn.BatchNorm2d, aa_layer=None, drop_path_rate=0.,
-                 zero_init_last_bn=True, stage_fn=CrossStage, block_fn=ResBottleneck):
-        super().__init__()
-        self.num_classes = num_classes
-        self.drop_rate = drop_rate
-        assert output_stride in (8, 16, 32)
-        layer_args = dict(act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer)
-
-        # Construct the stem
-        self.stem, stem_feat_info = create_stem(in_chans, **cfg['stem'], **layer_args)
-        self.feature_info = [stem_feat_info]
-        prev_chs = stem_feat_info['num_chs']
-        curr_stride = stem_feat_info['reduction']  # reduction does not include pool
-        if cfg['stem']['pool']:
-            curr_stride *= 2
-
-        # Construct the stages
-        per_stage_args = _cfg_to_stage_args(
-            cfg['stage'], curr_stride=curr_stride, output_stride=output_stride, drop_path_rate=drop_path_rate)
-        self.stages = nn.Sequential()
-        for i, sa in enumerate(per_stage_args):
-            self.stages.add_module(
-                str(i), stage_fn(prev_chs, **sa, **layer_args, block_fn=block_fn))
-            prev_chs = sa['out_chs']
-            curr_stride *= sa['stride']
-            self.feature_info += [dict(num_chs=prev_chs, reduction=curr_stride, module=f'stages.{i}')]
-
-        # Construct the head
-        self.num_features = prev_chs
-        self.head = ClassifierHead(
-            in_chs=prev_chs, num_classes=num_classes, pool_type=global_pool, drop_rate=drop_rate)
-
-        for m in self.modules():
-            if isinstance(m, nn.Conv2d):
-                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
-            elif isinstance(m, nn.BatchNorm2d):
-                nn.init.ones_(m.weight)
-                nn.init.zeros_(m.bias)
-            elif isinstance(m, nn.Linear):
-                nn.init.normal_(m.weight, mean=0.0, std=0.01)
-                nn.init.zeros_(m.bias)
-        if zero_init_last_bn:
-            for m in self.modules():
-                if hasattr(m, 'zero_init_last_bn'):
-                    m.zero_init_last_bn()
-
-    def get_classifier(self):
-        return self.head.fc
-
-    def reset_classifier(self, num_classes, global_pool='avg'):
-        self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate)
-
-    def forward_features(self, x):
-        x = self.stem(x)
-        x = self.stages(x)
-        return x
-
-    def forward(self, x):
-        x = self.forward_features(x)
-        x = self.head(x)
-        return x
-
-
-def _create_cspnet(variant, pretrained=False, **kwargs):
-    cfg_variant = variant.split('_')[0]
-    return build_model_with_cfg(
-        CspNet, variant, pretrained,
-        default_cfg=default_cfgs[variant],
-        feature_cfg=dict(flatten_sequential=True), model_cfg=model_cfgs[cfg_variant],
-        **kwargs)
-
-
-@register_model
-def cspresnet50(pretrained=False, **kwargs):
-    return _create_cspnet('cspresnet50',
pretrained=pretrained, **kwargs) - - -@register_model -def cspresnet50d(pretrained=False, **kwargs): - return _create_cspnet('cspresnet50d', pretrained=pretrained, **kwargs) - - -@register_model -def cspresnet50w(pretrained=False, **kwargs): - return _create_cspnet('cspresnet50w', pretrained=pretrained, **kwargs) - - -@register_model -def cspresnext50(pretrained=False, **kwargs): - return _create_cspnet('cspresnext50', pretrained=pretrained, **kwargs) - - -@register_model -def cspresnext50_iabn(pretrained=False, **kwargs): - norm_layer = get_norm_act_layer('iabn') - return _create_cspnet('cspresnext50_iabn', pretrained=pretrained, norm_layer=norm_layer, **kwargs) - - -@register_model -def cspdarknet53(pretrained=False, **kwargs): - return _create_cspnet('cspdarknet53', pretrained=pretrained, block_fn=DarkBlock, **kwargs) - - -@register_model -def cspdarknet53_iabn(pretrained=False, **kwargs): - norm_layer = get_norm_act_layer('iabn') - return _create_cspnet('cspdarknet53_iabn', pretrained=pretrained, block_fn=DarkBlock, norm_layer=norm_layer, **kwargs) - - -@register_model -def darknet53(pretrained=False, **kwargs): - return _create_cspnet('darknet53', pretrained=pretrained, block_fn=DarkBlock, stage_fn=DarkStage, **kwargs) diff --git a/AVLFormer/src/timm/models/densenet.py b/AVLFormer/src/timm/models/densenet.py deleted file mode 100644 index 36fd8cc..0000000 --- a/AVLFormer/src/timm/models/densenet.py +++ /dev/null @@ -1,387 +0,0 @@ -"""Pytorch Densenet implementation w/ tweaks -This file is a copy of https://github.com/pytorch/vision 'densenet.py' (BSD-3-Clause) with -fixed kwargs passthrough and addition of dynamic global avg/max pool. -""" -import re -from collections import OrderedDict -from functools import partial - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.utils.checkpoint as cp -from torch.jit.annotations import List - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .helpers import build_model_with_cfg -from .layers import BatchNormAct2d, create_norm_act, BlurPool2d, create_classifier -from .registry import register_model - -__all__ = ['DenseNet'] - - -def _cfg(url=''): - return { - 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), - 'crop_pct': 0.875, 'interpolation': 'bicubic', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'features.conv0', 'classifier': 'classifier', - } - - -default_cfgs = { - 'densenet121': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/densenet121_ra-50efcf5c.pth'), - 'densenet121d': _cfg(url=''), - 'densenetblur121d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/densenetblur121d_ra-100dcfbc.pth'), - 'densenet169': _cfg(url='https://download.pytorch.org/models/densenet169-b2777c0a.pth'), - 'densenet201': _cfg(url='https://download.pytorch.org/models/densenet201-c1103571.pth'), - 'densenet161': _cfg(url='https://download.pytorch.org/models/densenet161-8d451a50.pth'), - 'densenet264': _cfg(url=''), - 'densenet264d_iabn': _cfg(url=''), - 'tv_densenet121': _cfg(url='https://download.pytorch.org/models/densenet121-a639ec97.pth'), -} - - -class DenseLayer(nn.Module): - def __init__(self, num_input_features, growth_rate, bn_size, norm_layer=BatchNormAct2d, - drop_rate=0., memory_efficient=False): - super(DenseLayer, self).__init__() - self.add_module('norm1', norm_layer(num_input_features)), - self.add_module('conv1', nn.Conv2d( - 
num_input_features, bn_size * growth_rate, kernel_size=1, stride=1, bias=False)), - self.add_module('norm2', norm_layer(bn_size * growth_rate)), - self.add_module('conv2', nn.Conv2d( - bn_size * growth_rate, growth_rate, kernel_size=3, stride=1, padding=1, bias=False)), - self.drop_rate = float(drop_rate) - self.memory_efficient = memory_efficient - - def bottleneck_fn(self, xs): - # type: (List[torch.Tensor]) -> torch.Tensor - concated_features = torch.cat(xs, 1) - bottleneck_output = self.conv1(self.norm1(concated_features)) # noqa: T484 - return bottleneck_output - - # todo: rewrite when torchscript supports any - def any_requires_grad(self, x): - # type: (List[torch.Tensor]) -> bool - for tensor in x: - if tensor.requires_grad: - return True - return False - - @torch.jit.unused # noqa: T484 - def call_checkpoint_bottleneck(self, x): - # type: (List[torch.Tensor]) -> torch.Tensor - def closure(*xs): - return self.bottleneck_fn(xs) - - return cp.checkpoint(closure, *x) - - @torch.jit._overload_method # noqa: F811 - def forward(self, x): - # type: (List[torch.Tensor]) -> (torch.Tensor) - pass - - @torch.jit._overload_method # noqa: F811 - def forward(self, x): - # type: (torch.Tensor) -> (torch.Tensor) - pass - - # torchscript does not yet support *args, so we overload method - # allowing it to take either a List[Tensor] or single Tensor - def forward(self, x): # noqa: F811 - if isinstance(x, torch.Tensor): - prev_features = [x] - else: - prev_features = x - - if self.memory_efficient and self.any_requires_grad(prev_features): - if torch.jit.is_scripting(): - raise Exception("Memory Efficient not supported in JIT") - bottleneck_output = self.call_checkpoint_bottleneck(prev_features) - else: - bottleneck_output = self.bottleneck_fn(prev_features) - - new_features = self.conv2(self.norm2(bottleneck_output)) - if self.drop_rate > 0: - new_features = F.dropout(new_features, p=self.drop_rate, training=self.training) - return new_features - - -class DenseBlock(nn.ModuleDict): - _version = 2 - - def __init__(self, num_layers, num_input_features, bn_size, growth_rate, norm_layer=nn.ReLU, - drop_rate=0., memory_efficient=False): - super(DenseBlock, self).__init__() - for i in range(num_layers): - layer = DenseLayer( - num_input_features + i * growth_rate, - growth_rate=growth_rate, - bn_size=bn_size, - norm_layer=norm_layer, - drop_rate=drop_rate, - memory_efficient=memory_efficient, - ) - self.add_module('denselayer%d' % (i + 1), layer) - - def forward(self, init_features): - features = [init_features] - for name, layer in self.items(): - new_features = layer(features) - features.append(new_features) - return torch.cat(features, 1) - - -class DenseTransition(nn.Sequential): - def __init__(self, num_input_features, num_output_features, norm_layer=nn.BatchNorm2d, aa_layer=None): - super(DenseTransition, self).__init__() - self.add_module('norm', norm_layer(num_input_features)) - self.add_module('conv', nn.Conv2d( - num_input_features, num_output_features, kernel_size=1, stride=1, bias=False)) - if aa_layer is not None: - self.add_module('pool', aa_layer(num_output_features, stride=2)) - else: - self.add_module('pool', nn.AvgPool2d(kernel_size=2, stride=2)) - - -class DenseNet(nn.Module): - r"""Densenet-BC model class, based on - `"Densely Connected Convolutional Networks" `_ - - Args: - growth_rate (int) - how many filters to add each layer (`k` in paper) - block_config (list of 4 ints) - how many layers in each pooling block - bn_size (int) - multiplicative factor for number of bottle neck layers 
- (i.e. bn_size * k features in the bottleneck layer) - drop_rate (float) - dropout rate after each dense layer - num_classes (int) - number of classification classes - memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient, - but slower. Default: *False*. See `"paper" `_ - """ - - def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16), bn_size=4, stem_type='', - num_classes=1000, in_chans=3, global_pool='avg', - norm_layer=BatchNormAct2d, aa_layer=None, drop_rate=0, memory_efficient=False, - aa_stem_only=True): - self.num_classes = num_classes - self.drop_rate = drop_rate - super(DenseNet, self).__init__() - - # Stem - deep_stem = 'deep' in stem_type # 3x3 deep stem - num_init_features = growth_rate * 2 - if aa_layer is None: - stem_pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - else: - stem_pool = nn.Sequential(*[ - nn.MaxPool2d(kernel_size=3, stride=1, padding=1), - aa_layer(channels=num_init_features, stride=2)]) - if deep_stem: - stem_chs_1 = stem_chs_2 = growth_rate - if 'tiered' in stem_type: - stem_chs_1 = 3 * (growth_rate // 4) - stem_chs_2 = num_init_features if 'narrow' in stem_type else 6 * (growth_rate // 4) - self.features = nn.Sequential(OrderedDict([ - ('conv0', nn.Conv2d(in_chans, stem_chs_1, 3, stride=2, padding=1, bias=False)), - ('norm0', norm_layer(stem_chs_1)), - ('conv1', nn.Conv2d(stem_chs_1, stem_chs_2, 3, stride=1, padding=1, bias=False)), - ('norm1', norm_layer(stem_chs_2)), - ('conv2', nn.Conv2d(stem_chs_2, num_init_features, 3, stride=1, padding=1, bias=False)), - ('norm2', norm_layer(num_init_features)), - ('pool0', stem_pool), - ])) - else: - self.features = nn.Sequential(OrderedDict([ - ('conv0', nn.Conv2d(in_chans, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)), - ('norm0', norm_layer(num_init_features)), - ('pool0', stem_pool), - ])) - self.feature_info = [ - dict(num_chs=num_init_features, reduction=2, module=f'features.norm{2 if deep_stem else 0}')] - current_stride = 4 - - # DenseBlocks - num_features = num_init_features - for i, num_layers in enumerate(block_config): - block = DenseBlock( - num_layers=num_layers, - num_input_features=num_features, - bn_size=bn_size, - growth_rate=growth_rate, - norm_layer=norm_layer, - drop_rate=drop_rate, - memory_efficient=memory_efficient - ) - module_name = f'denseblock{(i + 1)}' - self.features.add_module(module_name, block) - num_features = num_features + num_layers * growth_rate - transition_aa_layer = None if aa_stem_only else aa_layer - if i != len(block_config) - 1: - self.feature_info += [ - dict(num_chs=num_features, reduction=current_stride, module='features.' + module_name)] - current_stride *= 2 - trans = DenseTransition( - num_input_features=num_features, num_output_features=num_features // 2, - norm_layer=norm_layer, aa_layer=transition_aa_layer) - self.features.add_module(f'transition{i + 1}', trans) - num_features = num_features // 2 - - # Final batch norm - self.features.add_module('norm5', norm_layer(num_features)) - - self.feature_info += [dict(num_chs=num_features, reduction=current_stride, module='features.norm5')] - self.num_features = num_features - - # Linear layer - self.global_pool, self.classifier = create_classifier( - self.num_features, self.num_classes, pool_type=global_pool) - - # Official init from torch repo. 
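# Editor's note, not part of the original source: a worked example of the channel
# bookkeeping above, assuming the densenet121 configuration (growth_rate=32,
# block_config=(6, 12, 24, 16)):
#
#   num_init_features = 2 * growth_rate = 64
#   denseblock1:  64 +  6 * 32 =  256  -> transition1 halves it to 128
#   denseblock2: 128 + 12 * 32 =  512  -> transition2 halves it to 256
#   denseblock3: 256 + 24 * 32 = 1024  -> transition3 halves it to 512
#   denseblock4: 512 + 16 * 32 = 1024  (no transition after the final block)
#
# so self.num_features ends at 1024, the input width of the classifier created above.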
- for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight) - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.Linear): - nn.init.constant_(m.bias, 0) - - def get_classifier(self): - return self.classifier - - def reset_classifier(self, num_classes, global_pool='avg'): - self.num_classes = num_classes - self.global_pool, self.classifier = create_classifier( - self.num_features, self.num_classes, pool_type=global_pool) - - def forward_features(self, x): - return self.features(x) - - def forward(self, x): - x = self.forward_features(x) - x = self.global_pool(x) - # both classifier and block drop? - # if self.drop_rate > 0.: - # x = F.dropout(x, p=self.drop_rate, training=self.training) - x = self.classifier(x) - return x - - -def _filter_torchvision_pretrained(state_dict): - pattern = re.compile( - r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$') - - for key in list(state_dict.keys()): - res = pattern.match(key) - if res: - new_key = res.group(1) + res.group(2) - state_dict[new_key] = state_dict[key] - del state_dict[key] - return state_dict - - -def _create_densenet(variant, growth_rate, block_config, pretrained, **kwargs): - kwargs['growth_rate'] = growth_rate - kwargs['block_config'] = block_config - return build_model_with_cfg( - DenseNet, variant, pretrained, - default_cfg=default_cfgs[variant], - feature_cfg=dict(flatten_sequential=True), pretrained_filter_fn=_filter_torchvision_pretrained, - **kwargs) - - -@register_model -def densenet121(pretrained=False, **kwargs): - r"""Densenet-121 model from - `"Densely Connected Convolutional Networks" ` - """ - model = _create_densenet( - 'densenet121', growth_rate=32, block_config=(6, 12, 24, 16), pretrained=pretrained, **kwargs) - return model - - -@register_model -def densenetblur121d(pretrained=False, **kwargs): - r"""Densenet-121 model from - `"Densely Connected Convolutional Networks" ` - """ - model = _create_densenet( - 'densenetblur121d', growth_rate=32, block_config=(6, 12, 24, 16), pretrained=pretrained, stem_type='deep', - aa_layer=BlurPool2d, **kwargs) - return model - - -@register_model -def densenet121d(pretrained=False, **kwargs): - r"""Densenet-121 model from - `"Densely Connected Convolutional Networks" ` - """ - model = _create_densenet( - 'densenet121d', growth_rate=32, block_config=(6, 12, 24, 16), stem_type='deep', - pretrained=pretrained, **kwargs) - return model - - -@register_model -def densenet169(pretrained=False, **kwargs): - r"""Densenet-169 model from - `"Densely Connected Convolutional Networks" ` - """ - model = _create_densenet( - 'densenet169', growth_rate=32, block_config=(6, 12, 32, 32), pretrained=pretrained, **kwargs) - return model - - -@register_model -def densenet201(pretrained=False, **kwargs): - r"""Densenet-201 model from - `"Densely Connected Convolutional Networks" ` - """ - model = _create_densenet( - 'densenet201', growth_rate=32, block_config=(6, 12, 48, 32), pretrained=pretrained, **kwargs) - return model - - -@register_model -def densenet161(pretrained=False, **kwargs): - r"""Densenet-161 model from - `"Densely Connected Convolutional Networks" ` - """ - model = _create_densenet( - 'densenet161', growth_rate=48, block_config=(6, 12, 36, 24), pretrained=pretrained, **kwargs) - return model - - -@register_model -def densenet264(pretrained=False, **kwargs): - r"""Densenet-264 model from - `"Densely Connected Convolutional Networks" ` - """ - 
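# Editor's note, not part of the original source: _filter_torchvision_pretrained above
# exists because torchvision checkpoints index norm/conv layers inside each dense layer
# (e.g. 'norm.1'), while this implementation flattens those names. The regex rewrite
# maps, for example:
#
#   'features.denseblock1.denselayer1.norm.1.weight'
#       -> 'features.denseblock1.denselayer1.norm1.weight'
#   'features.denseblock1.denselayer1.conv.2.weight'
#       -> 'features.denseblock1.denselayer1.conv2.weight'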
model = _create_densenet( - 'densenet264', growth_rate=48, block_config=(6, 12, 64, 48), pretrained=pretrained, **kwargs) - return model - - -@register_model -def densenet264d_iabn(pretrained=False, **kwargs): - r"""Densenet-264 model with deep stem and Inplace-ABN - """ - def norm_act_fn(num_features, **kwargs): - return create_norm_act('iabn', num_features, **kwargs) - model = _create_densenet( - 'densenet264d_iabn', growth_rate=48, block_config=(6, 12, 64, 48), stem_type='deep', - norm_layer=norm_act_fn, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tv_densenet121(pretrained=False, **kwargs): - r"""Densenet-121 model with original Torchvision weights, from - `"Densely Connected Convolutional Networks" ` - """ - model = _create_densenet( - 'tv_densenet121', growth_rate=32, block_config=(6, 12, 24, 16), pretrained=pretrained, **kwargs) - return model diff --git a/AVLFormer/src/timm/models/dla.py b/AVLFormer/src/timm/models/dla.py deleted file mode 100644 index cd15321..0000000 --- a/AVLFormer/src/timm/models/dla.py +++ /dev/null @@ -1,441 +0,0 @@ -""" Deep Layer Aggregation and DLA w/ Res2Net -DLA original adapted from Official Pytorch impl at: -DLA Paper: `Deep Layer Aggregation` - https://arxiv.org/abs/1707.06484 - -Res2Net additions from: https://github.com/gasvn/Res2Net/ -Res2Net Paper: `Res2Net: A New Multi-scale Backbone Architecture` - https://arxiv.org/abs/1904.01169 -""" -import math - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .helpers import build_model_with_cfg -from .layers import create_classifier -from .registry import register_model - -__all__ = ['DLA'] - - -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), - 'crop_pct': 0.875, 'interpolation': 'bilinear', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'base_layer.0', 'classifier': 'fc', - **kwargs - } - - -default_cfgs = { - 'dla34': _cfg(url='http://dl.yf.io/dla/models/imagenet/dla34-ba72cf86.pth'), - 'dla46_c': _cfg(url='http://dl.yf.io/dla/models/imagenet/dla46_c-2bfd52c3.pth'), - 'dla46x_c': _cfg(url='http://dl.yf.io/dla/models/imagenet/dla46x_c-d761bae7.pth'), - 'dla60x_c': _cfg(url='http://dl.yf.io/dla/models/imagenet/dla60x_c-b870c45c.pth'), - 'dla60': _cfg(url='http://dl.yf.io/dla/models/imagenet/dla60-24839fc4.pth'), - 'dla60x': _cfg(url='http://dl.yf.io/dla/models/imagenet/dla60x-d15cacda.pth'), - 'dla102': _cfg(url='http://dl.yf.io/dla/models/imagenet/dla102-d94d9790.pth'), - 'dla102x': _cfg(url='http://dl.yf.io/dla/models/imagenet/dla102x-ad62be81.pth'), - 'dla102x2': _cfg(url='http://dl.yf.io/dla/models/imagenet/dla102x2-262837b6.pth'), - 'dla169': _cfg(url='http://dl.yf.io/dla/models/imagenet/dla169-0914e092.pth'), - 'dla60_res2net': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2net_dla60_4s-d88db7f9.pth'), - 'dla60_res2next': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2next_dla60_4s-d327927b.pth'), -} - - -class DlaBasic(nn.Module): - """DLA Basic""" - - def __init__(self, inplanes, planes, stride=1, dilation=1, **_): - super(DlaBasic, self).__init__() - self.conv1 = nn.Conv2d( - inplanes, planes, kernel_size=3, stride=stride, padding=dilation, bias=False, dilation=dilation) - self.bn1 = nn.BatchNorm2d(planes) - self.relu = nn.ReLU(inplace=True) - self.conv2 = 
nn.Conv2d( - planes, planes, kernel_size=3, stride=1, padding=dilation, bias=False, dilation=dilation) - self.bn2 = nn.BatchNorm2d(planes) - self.stride = stride - - def forward(self, x, residual=None): - if residual is None: - residual = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - - out += residual - out = self.relu(out) - - return out - - -class DlaBottleneck(nn.Module): - """DLA/DLA-X Bottleneck""" - expansion = 2 - - def __init__(self, inplanes, outplanes, stride=1, dilation=1, cardinality=1, base_width=64): - super(DlaBottleneck, self).__init__() - self.stride = stride - mid_planes = int(math.floor(outplanes * (base_width / 64)) * cardinality) - mid_planes = mid_planes // self.expansion - - self.conv1 = nn.Conv2d(inplanes, mid_planes, kernel_size=1, bias=False) - self.bn1 = nn.BatchNorm2d(mid_planes) - self.conv2 = nn.Conv2d( - mid_planes, mid_planes, kernel_size=3, stride=stride, padding=dilation, - bias=False, dilation=dilation, groups=cardinality) - self.bn2 = nn.BatchNorm2d(mid_planes) - self.conv3 = nn.Conv2d(mid_planes, outplanes, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(outplanes) - self.relu = nn.ReLU(inplace=True) - - def forward(self, x, residual=None): - if residual is None: - residual = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - out = self.relu(out) - - out = self.conv3(out) - out = self.bn3(out) - - out += residual - out = self.relu(out) - - return out - - -class DlaBottle2neck(nn.Module): - """ Res2Net/Res2NeXT DLA Bottleneck - Adapted from https://github.com/gasvn/Res2Net/blob/master/dla.py - """ - expansion = 2 - - def __init__(self, inplanes, outplanes, stride=1, dilation=1, scale=4, cardinality=8, base_width=4): - super(DlaBottle2neck, self).__init__() - self.is_first = stride > 1 - self.scale = scale - mid_planes = int(math.floor(outplanes * (base_width / 64)) * cardinality) - mid_planes = mid_planes // self.expansion - self.width = mid_planes - - self.conv1 = nn.Conv2d(inplanes, mid_planes * scale, kernel_size=1, bias=False) - self.bn1 = nn.BatchNorm2d(mid_planes * scale) - - num_scale_convs = max(1, scale - 1) - convs = [] - bns = [] - for _ in range(num_scale_convs): - convs.append(nn.Conv2d( - mid_planes, mid_planes, kernel_size=3, stride=stride, - padding=dilation, dilation=dilation, groups=cardinality, bias=False)) - bns.append(nn.BatchNorm2d(mid_planes)) - self.convs = nn.ModuleList(convs) - self.bns = nn.ModuleList(bns) - if self.is_first: - self.pool = nn.AvgPool2d(kernel_size=3, stride=stride, padding=1) - - self.conv3 = nn.Conv2d(mid_planes * scale, outplanes, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(outplanes) - self.relu = nn.ReLU(inplace=True) - - def forward(self, x, residual=None): - if residual is None: - residual = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - spx = torch.split(out, self.width, 1) - spo = [] - for i, (conv, bn) in enumerate(zip(self.convs, self.bns)): - sp = spx[i] if i == 0 or self.is_first else sp + spx[i] - sp = conv(sp) - sp = bn(sp) - sp = self.relu(sp) - spo.append(sp) - if self.scale > 1: - spo.append(self.pool(spx[-1]) if self.is_first else spx[-1]) - out = torch.cat(spo, 1) - - out = self.conv3(out) - out = self.bn3(out) - - out += residual - out = self.relu(out) - - return out - - -class DlaRoot(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, residual): - super(DlaRoot, self).__init__() - 
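# Editor's note, not part of the original source: worked examples of the bottleneck
# width arithmetic used by the blocks above, i.e.
# mid_planes = floor(outplanes * base_width / 64) * cardinality // expansion:
#
#   DLA-60   (cardinality=1,  base_width=64), outplanes=256: 256 * 1  // 2 = 128
#   DLA-X-60 (cardinality=32, base_width=4),  outplanes=256: 16  * 32 // 2 = 256
#
# DlaBottle2neck additionally multiplies mid_planes by `scale` in conv1, then splits
# the result into `scale` groups of width mid_planes for the hierarchical Res2Net convs.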
self.conv = nn.Conv2d( - in_channels, out_channels, 1, stride=1, bias=False, padding=(kernel_size - 1) // 2) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU(inplace=True) - self.residual = residual - - def forward(self, *x): - children = x - x = self.conv(torch.cat(x, 1)) - x = self.bn(x) - if self.residual: - x += children[0] - x = self.relu(x) - - return x - - -class DlaTree(nn.Module): - def __init__(self, levels, block, in_channels, out_channels, stride=1, - dilation=1, cardinality=1, base_width=64, - level_root=False, root_dim=0, root_kernel_size=1, root_residual=False): - super(DlaTree, self).__init__() - if root_dim == 0: - root_dim = 2 * out_channels - if level_root: - root_dim += in_channels - self.downsample = nn.MaxPool2d(stride, stride=stride) if stride > 1 else nn.Identity() - self.project = nn.Identity() - cargs = dict(dilation=dilation, cardinality=cardinality, base_width=base_width) - if levels == 1: - self.tree1 = block(in_channels, out_channels, stride, **cargs) - self.tree2 = block(out_channels, out_channels, 1, **cargs) - if in_channels != out_channels: - # NOTE the official impl/weights have project layers in levels > 1 case that are never - # used, I've moved the project layer here to avoid wasted params but old checkpoints will - # need strict=False while loading. - self.project = nn.Sequential( - nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=False), - nn.BatchNorm2d(out_channels)) - else: - cargs.update(dict(root_kernel_size=root_kernel_size, root_residual=root_residual)) - self.tree1 = DlaTree( - levels - 1, block, in_channels, out_channels, stride, root_dim=0, **cargs) - self.tree2 = DlaTree( - levels - 1, block, out_channels, out_channels, root_dim=root_dim + out_channels, **cargs) - if levels == 1: - self.root = DlaRoot(root_dim, out_channels, root_kernel_size, root_residual) - self.level_root = level_root - self.root_dim = root_dim - self.levels = levels - - def forward(self, x, residual=None, children=None): - children = [] if children is None else children - bottom = self.downsample(x) - residual = self.project(bottom) - if self.level_root: - children.append(bottom) - x1 = self.tree1(x, residual) - if self.levels == 1: - x2 = self.tree2(x1) - x = self.root(x2, x1, *children) - else: - children.append(x1) - x = self.tree2(x1, children=children) - return x - - -class DLA(nn.Module): - def __init__(self, levels, channels, output_stride=32, num_classes=1000, in_chans=3, - cardinality=1, base_width=64, block=DlaBottle2neck, residual_root=False, - drop_rate=0.0, global_pool='avg'): - super(DLA, self).__init__() - self.channels = channels - self.num_classes = num_classes - self.cardinality = cardinality - self.base_width = base_width - self.drop_rate = drop_rate - assert output_stride == 32 # FIXME support dilation - - self.base_layer = nn.Sequential( - nn.Conv2d(in_chans, channels[0], kernel_size=7, stride=1, padding=3, bias=False), - nn.BatchNorm2d(channels[0]), - nn.ReLU(inplace=True)) - self.level0 = self._make_conv_level(channels[0], channels[0], levels[0]) - self.level1 = self._make_conv_level(channels[0], channels[1], levels[1], stride=2) - cargs = dict(cardinality=cardinality, base_width=base_width, root_residual=residual_root) - self.level2 = DlaTree(levels[2], block, channels[1], channels[2], 2, level_root=False, **cargs) - self.level3 = DlaTree(levels[3], block, channels[2], channels[3], 2, level_root=True, **cargs) - self.level4 = DlaTree(levels[4], block, channels[3], channels[4], 2, level_root=True, **cargs) - 
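# Editor's note, not part of the original source: for the levels == 1 base case, the
# aggregation implemented by DlaTree.forward above reduces to
#
#   x1  = tree1(x, residual)        # residual = project(downsample(x))
#   x2  = tree2(x1)
#   out = root(x2, x1, *children)   # DlaRoot: 1x1 conv + BN + ReLU over the channel concat
#
# deeper trees recurse through tree1/tree2 and pass the accumulated children down to the
# root at the leaf level, which is how the level3/level4/level5 stages here aggregate.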
self.level5 = DlaTree(levels[5], block, channels[4], channels[5], 2, level_root=True, **cargs) - self.feature_info = [ - dict(num_chs=channels[0], reduction=1, module='level0'), # rare to have a meaningful stride 1 level - dict(num_chs=channels[1], reduction=2, module='level1'), - dict(num_chs=channels[2], reduction=4, module='level2'), - dict(num_chs=channels[3], reduction=8, module='level3'), - dict(num_chs=channels[4], reduction=16, module='level4'), - dict(num_chs=channels[5], reduction=32, module='level5'), - ] - - self.num_features = channels[-1] - self.global_pool, self.fc = create_classifier( - self.num_features, self.num_classes, pool_type=global_pool, use_conv=True) - for m in self.modules(): - if isinstance(m, nn.Conv2d): - n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - m.weight.data.normal_(0, math.sqrt(2. / n)) - elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1) - m.bias.data.zero_() - - def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1): - modules = [] - for i in range(convs): - modules.extend([ - nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride if i == 0 else 1, - padding=dilation, bias=False, dilation=dilation), - nn.BatchNorm2d(planes), - nn.ReLU(inplace=True)]) - inplanes = planes - return nn.Sequential(*modules) - - def get_classifier(self): - return self.fc - - def reset_classifier(self, num_classes, global_pool='avg'): - self.num_classes = num_classes - self.global_pool, self.fc = create_classifier( - self.num_features, self.num_classes, pool_type=global_pool, use_conv=True) - - def forward_features(self, x): - x = self.base_layer(x) - x = self.level0(x) - x = self.level1(x) - x = self.level2(x) - x = self.level3(x) - x = self.level4(x) - x = self.level5(x) - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.global_pool(x) - if self.drop_rate > 0.: - x = F.dropout(x, p=self.drop_rate, training=self.training) - x = self.fc(x) - if not self.global_pool.is_identity(): - x = x.flatten(1) # conv classifier, flatten if pooling isn't pass-through (disabled) - return x - - -def _create_dla(variant, pretrained=False, **kwargs): - return build_model_with_cfg( - DLA, variant, pretrained, - default_cfg=default_cfgs[variant], - pretrained_strict=False, - feature_cfg=dict(out_indices=(1, 2, 3, 4, 5)), - **kwargs) - - -@register_model -def dla60_res2net(pretrained=False, **kwargs): - model_kwargs = dict( - levels=(1, 1, 1, 2, 3, 1), channels=(16, 32, 128, 256, 512, 1024), - block=DlaBottle2neck, cardinality=1, base_width=28, **kwargs) - return _create_dla('dla60_res2net', pretrained, **model_kwargs) - - -@register_model -def dla60_res2next(pretrained=False,**kwargs): - model_kwargs = dict( - levels=(1, 1, 1, 2, 3, 1), channels=(16, 32, 128, 256, 512, 1024), - block=DlaBottle2neck, cardinality=8, base_width=4, **kwargs) - return _create_dla('dla60_res2next', pretrained, **model_kwargs) - - -@register_model -def dla34(pretrained=False, **kwargs): # DLA-34 - model_kwargs = dict( - levels=[1, 1, 1, 2, 2, 1], channels=[16, 32, 64, 128, 256, 512], - block=DlaBasic, **kwargs) - return _create_dla('dla34', pretrained, **model_kwargs) - - -@register_model -def dla46_c(pretrained=False, **kwargs): # DLA-46-C - model_kwargs = dict( - levels=[1, 1, 1, 2, 2, 1], channels=[16, 32, 64, 64, 128, 256], - block=DlaBottleneck, **kwargs) - return _create_dla('dla46_c', pretrained, **model_kwargs) - - -@register_model -def dla46x_c(pretrained=False, **kwargs): # DLA-X-46-C - model_kwargs = dict( - levels=[1, 1, 1, 2, 2, 1], 
channels=[16, 32, 64, 64, 128, 256], - block=DlaBottleneck, cardinality=32, base_width=4, **kwargs) - return _create_dla('dla46x_c', pretrained, **model_kwargs) - - -@register_model -def dla60x_c(pretrained=False, **kwargs): # DLA-X-60-C - model_kwargs = dict( - levels=[1, 1, 1, 2, 3, 1], channels=[16, 32, 64, 64, 128, 256], - block=DlaBottleneck, cardinality=32, base_width=4, **kwargs) - return _create_dla('dla60x_c', pretrained, **model_kwargs) - - -@register_model -def dla60(pretrained=False, **kwargs): # DLA-60 - model_kwargs = dict( - levels=[1, 1, 1, 2, 3, 1], channels=[16, 32, 128, 256, 512, 1024], - block=DlaBottleneck, **kwargs) - return _create_dla('dla60', pretrained, **model_kwargs) - - -@register_model -def dla60x(pretrained=False, **kwargs): # DLA-X-60 - model_kwargs = dict( - levels=[1, 1, 1, 2, 3, 1], channels=[16, 32, 128, 256, 512, 1024], - block=DlaBottleneck, cardinality=32, base_width=4, **kwargs) - return _create_dla('dla60x', pretrained, **model_kwargs) - - -@register_model -def dla102(pretrained=False, **kwargs): # DLA-102 - model_kwargs = dict( - levels=[1, 1, 1, 3, 4, 1], channels=[16, 32, 128, 256, 512, 1024], - block=DlaBottleneck, residual_root=True, **kwargs) - return _create_dla('dla102', pretrained, **model_kwargs) - - -@register_model -def dla102x(pretrained=False, **kwargs): # DLA-X-102 - model_kwargs = dict( - levels=[1, 1, 1, 3, 4, 1], channels=[16, 32, 128, 256, 512, 1024], - block=DlaBottleneck, cardinality=32, base_width=4, residual_root=True, **kwargs) - return _create_dla('dla102x', pretrained, **model_kwargs) - - -@register_model -def dla102x2(pretrained=False, **kwargs): # DLA-X-102 64 - model_kwargs = dict( - levels=[1, 1, 1, 3, 4, 1], channels=[16, 32, 128, 256, 512, 1024], - block=DlaBottleneck, cardinality=64, base_width=4, residual_root=True, **kwargs) - return _create_dla('dla102x2', pretrained, **model_kwargs) - - -@register_model -def dla169(pretrained=False, **kwargs): # DLA-169 - model_kwargs = dict( - levels=[1, 1, 2, 3, 5, 1], channels=[16, 32, 128, 256, 512, 1024], - block=DlaBottleneck, residual_root=True, **kwargs) - return _create_dla('dla169', pretrained, **model_kwargs) diff --git a/AVLFormer/src/timm/models/dpn.py b/AVLFormer/src/timm/models/dpn.py deleted file mode 100644 index 037df83..0000000 --- a/AVLFormer/src/timm/models/dpn.py +++ /dev/null @@ -1,316 +0,0 @@ -""" PyTorch implementation of DualPathNetworks -Based on original MXNet implementation https://github.com/cypw/DPNs with -many ideas from another PyTorch implementation https://github.com/oyam/pytorch-DPNs. - -This implementation is compatible with the pretrained weights from cypw's MXNet implementation. 
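Editor's addition, not part of the original docstring: every DualPathBlock below threads
a (residual, dense) pair of tensors through the network. The combine step per block is
roughly:

    resid = x_s1 + out1                      # ResNet-style path, fixed width num_1x1_c
    dense = torch.cat([x_s2, out2], dim=1)   # DenseNet-style path, grows by `inc` channels

so DPN pairs the feature re-use of residual connections with the new-feature exploration
of dense connections, which is the paper's stated motivation.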
- -Hacked together by / Copyright 2020 Ross Wightman -""" -from collections import OrderedDict -from functools import partial -from typing import Tuple - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from src.timm.data import IMAGENET_DPN_MEAN, IMAGENET_DPN_STD, IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .helpers import build_model_with_cfg -from .layers import BatchNormAct2d, ConvBnAct, create_conv2d, create_classifier -from .registry import register_model - -__all__ = ['DPN'] - - -def _cfg(url='', **kwargs): - return { - 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), - 'crop_pct': 0.875, 'interpolation': 'bicubic', - 'mean': IMAGENET_DPN_MEAN, 'std': IMAGENET_DPN_STD, - 'first_conv': 'features.conv1_1.conv', 'classifier': 'classifier', - **kwargs - } - - -default_cfgs = { - 'dpn68': _cfg( - url='https://github.com/rwightman/pytorch-dpn-pretrained/releases/download/v0.1/dpn68-66bebafa7.pth'), - 'dpn68b': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dpn68b_ra-a31ca160.pth', - mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), - 'dpn92': _cfg( - url='https://github.com/rwightman/pytorch-dpn-pretrained/releases/download/v0.1/dpn92_extra-b040e4a9b.pth'), - 'dpn98': _cfg( - url='https://github.com/rwightman/pytorch-dpn-pretrained/releases/download/v0.1/dpn98-5b90dec4d.pth'), - 'dpn131': _cfg( - url='https://github.com/rwightman/pytorch-dpn-pretrained/releases/download/v0.1/dpn131-71dfe43e0.pth'), - 'dpn107': _cfg( - url='https://github.com/rwightman/pytorch-dpn-pretrained/releases/download/v0.1/dpn107_extra-1ac7121e2.pth') -} - - -class CatBnAct(nn.Module): - def __init__(self, in_chs, norm_layer=BatchNormAct2d): - super(CatBnAct, self).__init__() - self.bn = norm_layer(in_chs, eps=0.001) - - @torch.jit._overload_method # noqa: F811 - def forward(self, x): - # type: (Tuple[torch.Tensor, torch.Tensor]) -> (torch.Tensor) - pass - - @torch.jit._overload_method # noqa: F811 - def forward(self, x): - # type: (torch.Tensor) -> (torch.Tensor) - pass - - def forward(self, x): - if isinstance(x, tuple): - x = torch.cat(x, dim=1) - return self.bn(x) - - -class BnActConv2d(nn.Module): - def __init__(self, in_chs, out_chs, kernel_size, stride, groups=1, norm_layer=BatchNormAct2d): - super(BnActConv2d, self).__init__() - self.bn = norm_layer(in_chs, eps=0.001) - self.conv = create_conv2d(in_chs, out_chs, kernel_size, stride=stride, groups=groups) - - def forward(self, x): - return self.conv(self.bn(x)) - - -class DualPathBlock(nn.Module): - def __init__( - self, in_chs, num_1x1_a, num_3x3_b, num_1x1_c, inc, groups, block_type='normal', b=False): - super(DualPathBlock, self).__init__() - self.num_1x1_c = num_1x1_c - self.inc = inc - self.b = b - if block_type == 'proj': - self.key_stride = 1 - self.has_proj = True - elif block_type == 'down': - self.key_stride = 2 - self.has_proj = True - else: - assert block_type == 'normal' - self.key_stride = 1 - self.has_proj = False - - self.c1x1_w_s1 = None - self.c1x1_w_s2 = None - if self.has_proj: - # Using different member names here to allow easier parameter key matching for conversion - if self.key_stride == 2: - self.c1x1_w_s2 = BnActConv2d( - in_chs=in_chs, out_chs=num_1x1_c + 2 * inc, kernel_size=1, stride=2) - else: - self.c1x1_w_s1 = BnActConv2d( - in_chs=in_chs, out_chs=num_1x1_c + 2 * inc, kernel_size=1, stride=1) - - self.c1x1_a = BnActConv2d(in_chs=in_chs, out_chs=num_1x1_a, kernel_size=1, stride=1) - self.c3x3_b = BnActConv2d( - 
in_chs=num_1x1_a, out_chs=num_3x3_b, kernel_size=3, stride=self.key_stride, groups=groups) - if b: - self.c1x1_c = CatBnAct(in_chs=num_3x3_b) - self.c1x1_c1 = create_conv2d(num_3x3_b, num_1x1_c, kernel_size=1) - self.c1x1_c2 = create_conv2d(num_3x3_b, inc, kernel_size=1) - else: - self.c1x1_c = BnActConv2d(in_chs=num_3x3_b, out_chs=num_1x1_c + inc, kernel_size=1, stride=1) - self.c1x1_c1 = None - self.c1x1_c2 = None - - @torch.jit._overload_method # noqa: F811 - def forward(self, x): - # type: (Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor] - pass - - @torch.jit._overload_method # noqa: F811 - def forward(self, x): - # type: (torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor] - pass - - def forward(self, x) -> Tuple[torch.Tensor, torch.Tensor]: - if isinstance(x, tuple): - x_in = torch.cat(x, dim=1) - else: - x_in = x - if self.c1x1_w_s1 is None and self.c1x1_w_s2 is None: - # self.has_proj == False, torchscript requires condition on module == None - x_s1 = x[0] - x_s2 = x[1] - else: - # self.has_proj == True - if self.c1x1_w_s1 is not None: - # self.key_stride = 1 - x_s = self.c1x1_w_s1(x_in) - else: - # self.key_stride = 2 - x_s = self.c1x1_w_s2(x_in) - x_s1 = x_s[:, :self.num_1x1_c, :, :] - x_s2 = x_s[:, self.num_1x1_c:, :, :] - x_in = self.c1x1_a(x_in) - x_in = self.c3x3_b(x_in) - x_in = self.c1x1_c(x_in) - if self.c1x1_c1 is not None: - # self.b == True, using None check for torchscript compat - out1 = self.c1x1_c1(x_in) - out2 = self.c1x1_c2(x_in) - else: - out1 = x_in[:, :self.num_1x1_c, :, :] - out2 = x_in[:, self.num_1x1_c:, :, :] - resid = x_s1 + out1 - dense = torch.cat([x_s2, out2], dim=1) - return resid, dense - - -class DPN(nn.Module): - def __init__(self, small=False, num_init_features=64, k_r=96, groups=32, - b=False, k_sec=(3, 4, 20, 3), inc_sec=(16, 32, 24, 128), output_stride=32, - num_classes=1000, in_chans=3, drop_rate=0., global_pool='avg', fc_act=nn.ELU): - super(DPN, self).__init__() - self.num_classes = num_classes - self.drop_rate = drop_rate - self.b = b - assert output_stride == 32 # FIXME look into dilation support - norm_layer = partial(BatchNormAct2d, eps=.001) - fc_norm_layer = partial(BatchNormAct2d, eps=.001, act_layer=fc_act, inplace=False) - bw_factor = 1 if small else 4 - blocks = OrderedDict() - - # conv1 - blocks['conv1_1'] = ConvBnAct( - in_chans, num_init_features, kernel_size=3 if small else 7, stride=2, norm_layer=norm_layer) - blocks['conv1_pool'] = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - self.feature_info = [dict(num_chs=num_init_features, reduction=2, module='features.conv1_1')] - - # conv2 - bw = 64 * bw_factor - inc = inc_sec[0] - r = (k_r * bw) // (64 * bw_factor) - blocks['conv2_1'] = DualPathBlock(num_init_features, r, r, bw, inc, groups, 'proj', b) - in_chs = bw + 3 * inc - for i in range(2, k_sec[0] + 1): - blocks['conv2_' + str(i)] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'normal', b) - in_chs += inc - self.feature_info += [dict(num_chs=in_chs, reduction=4, module=f'features.conv2_{k_sec[0]}')] - - # conv3 - bw = 128 * bw_factor - inc = inc_sec[1] - r = (k_r * bw) // (64 * bw_factor) - blocks['conv3_1'] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'down', b) - in_chs = bw + 3 * inc - for i in range(2, k_sec[1] + 1): - blocks['conv3_' + str(i)] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'normal', b) - in_chs += inc - self.feature_info += [dict(num_chs=in_chs, reduction=8, module=f'features.conv3_{k_sec[1]}')] - - # conv4 - bw = 256 * bw_factor - inc = inc_sec[2] - r = (k_r * bw) // (64 * 
bw_factor) - blocks['conv4_1'] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'down', b) - in_chs = bw + 3 * inc - for i in range(2, k_sec[2] + 1): - blocks['conv4_' + str(i)] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'normal', b) - in_chs += inc - self.feature_info += [dict(num_chs=in_chs, reduction=16, module=f'features.conv4_{k_sec[2]}')] - - # conv5 - bw = 512 * bw_factor - inc = inc_sec[3] - r = (k_r * bw) // (64 * bw_factor) - blocks['conv5_1'] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'down', b) - in_chs = bw + 3 * inc - for i in range(2, k_sec[3] + 1): - blocks['conv5_' + str(i)] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'normal', b) - in_chs += inc - self.feature_info += [dict(num_chs=in_chs, reduction=32, module=f'features.conv5_{k_sec[3]}')] - - blocks['conv5_bn_ac'] = CatBnAct(in_chs, norm_layer=fc_norm_layer) - - self.num_features = in_chs - self.features = nn.Sequential(blocks) - - # Using 1x1 conv for the FC layer to allow the extra pooling scheme - self.global_pool, self.classifier = create_classifier( - self.num_features, self.num_classes, pool_type=global_pool, use_conv=True) - - def get_classifier(self): - return self.classifier - - def reset_classifier(self, num_classes, global_pool='avg'): - self.num_classes = num_classes - self.global_pool, self.classifier = create_classifier( - self.num_features, self.num_classes, pool_type=global_pool, use_conv=True) - - def forward_features(self, x): - return self.features(x) - - def forward(self, x): - x = self.forward_features(x) - x = self.global_pool(x) - if self.drop_rate > 0.: - x = F.dropout(x, p=self.drop_rate, training=self.training) - x = self.classifier(x) - if not self.global_pool.is_identity(): - x = x.flatten(1) # conv classifier, flatten if pooling isn't pass-through (disabled) - return x - - -def _create_dpn(variant, pretrained=False, **kwargs): - return build_model_with_cfg( - DPN, variant, pretrained, - default_cfg=default_cfgs[variant], - feature_cfg=dict(feature_concat=True, flatten_sequential=True), - **kwargs) - - -@register_model -def dpn68(pretrained=False, **kwargs): - model_kwargs = dict( - small=True, num_init_features=10, k_r=128, groups=32, - k_sec=(3, 4, 12, 3), inc_sec=(16, 32, 32, 64), **kwargs) - return _create_dpn('dpn68', pretrained=pretrained, **model_kwargs) - - -@register_model -def dpn68b(pretrained=False, **kwargs): - model_kwargs = dict( - small=True, num_init_features=10, k_r=128, groups=32, - b=True, k_sec=(3, 4, 12, 3), inc_sec=(16, 32, 32, 64), **kwargs) - return _create_dpn('dpn68b', pretrained=pretrained, **model_kwargs) - - -@register_model -def dpn92(pretrained=False, **kwargs): - model_kwargs = dict( - num_init_features=64, k_r=96, groups=32, - k_sec=(3, 4, 20, 3), inc_sec=(16, 32, 24, 128), **kwargs) - return _create_dpn('dpn92', pretrained=pretrained, **model_kwargs) - - -@register_model -def dpn98(pretrained=False, **kwargs): - model_kwargs = dict( - num_init_features=96, k_r=160, groups=40, - k_sec=(3, 6, 20, 3), inc_sec=(16, 32, 32, 128), **kwargs) - return _create_dpn('dpn98', pretrained=pretrained, **model_kwargs) - - -@register_model -def dpn131(pretrained=False, **kwargs): - model_kwargs = dict( - num_init_features=128, k_r=160, groups=40, - k_sec=(4, 8, 28, 3), inc_sec=(16, 32, 32, 128), **kwargs) - return _create_dpn('dpn131', pretrained=pretrained, **model_kwargs) - - -@register_model -def dpn107(pretrained=False, **kwargs): - model_kwargs = dict( - num_init_features=128, k_r=200, groups=50, - k_sec=(4, 8, 20, 3), inc_sec=(20, 64, 64, 128), **kwargs) - 
**kwargs) - return _create_dpn('dpn107', pretrained=pretrained, **model_kwargs) diff --git a/AVLFormer/src/timm/models/efficientnet.py b/AVLFormer/src/timm/models/efficientnet.py deleted file mode 100644 index 4fa5348..0000000 --- a/AVLFormer/src/timm/models/efficientnet.py +++ /dev/null @@ -1,1797 +0,0 @@ -""" PyTorch EfficientNet Family - -An implementation of EfficientNet that covers a variety of related models with efficient architectures: - -* EfficientNet (B0-B8, L2 + Tensorflow pretrained AutoAug/RandAug/AdvProp/NoisyStudent weight ports) - - EfficientNet: Rethinking Model Scaling for CNNs - https://arxiv.org/abs/1905.11946 - - CondConv: Conditionally Parameterized Convolutions for Efficient Inference - https://arxiv.org/abs/1904.04971 - - Adversarial Examples Improve Image Recognition - https://arxiv.org/abs/1911.09665 - - Self-training with Noisy Student improves ImageNet classification - https://arxiv.org/abs/1911.04252 - -* MixNet (Small, Medium, and Large) - - MixConv: Mixed Depthwise Convolutional Kernels - https://arxiv.org/abs/1907.09595 - -* MNasNet B1, A1 (SE), Small - - MnasNet: Platform-Aware Neural Architecture Search for Mobile - https://arxiv.org/abs/1807.11626 - -* FBNet-C - - FBNet: Hardware-Aware Efficient ConvNet Design via Differentiable NAS - https://arxiv.org/abs/1812.03443 - -* Single-Path NAS Pixel1 - - Single-Path NAS: Designing Hardware-Efficient ConvNets - https://arxiv.org/abs/1904.02877 - -* And likely more... - -Hacked together by / Copyright 2020 Ross Wightman -""" -import torch -import torch.nn as nn -import torch.nn.functional as F - -from typing import List - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD -from .efficientnet_blocks import round_channels, resolve_bn_args, resolve_act_layer, BN_EPS_TF_DEFAULT -from .efficientnet_builder import EfficientNetBuilder, decode_arch_def, efficientnet_init_weights -from .features import FeatureInfo, FeatureHooks -from .helpers import build_model_with_cfg, default_cfg_for_features -from .layers import create_conv2d, create_classifier -from .registry import register_model - -__all__ = ['EfficientNet'] - - -def _cfg(url='', **kwargs): - return { - 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), - 'crop_pct': 0.875, 'interpolation': 'bicubic', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'conv_stem', 'classifier': 'classifier', - **kwargs - } - - -default_cfgs = { - 'mnasnet_050': _cfg(url=''), - 'mnasnet_075': _cfg(url=''), - 'mnasnet_100': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mnasnet_b1-74cb7081.pth'), - 'mnasnet_140': _cfg(url=''), - - 'semnasnet_050': _cfg(url=''), - 'semnasnet_075': _cfg(url=''), - 'semnasnet_100': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mnasnet_a1-d9418771.pth'), - 'semnasnet_140': _cfg(url=''), - 'mnasnet_small': _cfg(url=''), - - 'mobilenetv2_100': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_100_ra-b33bc2c4.pth'), - 'mobilenetv2_110d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_110d_ra-77090ade.pth'), - 'mobilenetv2_120d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_120d_ra-5987e2ed.pth'), - 'mobilenetv2_140': _cfg( - 
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_140_ra-21a4e913.pth'), - - 'fbnetc_100': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/fbnetc_100-c345b898.pth', - interpolation='bilinear'), - 'spnasnet_100': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/spnasnet_100-048bc3f4.pth', - interpolation='bilinear'), - - 'efficientnet_b0': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b0_ra-3dd342df.pth'), - 'efficientnet_b1': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b1-533bc792.pth', - input_size=(3, 240, 240), pool_size=(8, 8)), - 'efficientnet_b2': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b2_ra-bcdf34b7.pth', - input_size=(3, 260, 260), pool_size=(9, 9)), - 'efficientnet_b2a': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b2_ra-bcdf34b7.pth', - input_size=(3, 288, 288), pool_size=(9, 9), crop_pct=1.0), - 'efficientnet_b3': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b3_ra2-cf984f9c.pth', - input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904), - 'efficientnet_b3a': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b3_ra2-cf984f9c.pth', - input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0), - 'efficientnet_b4': _cfg( - url='', input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.922), - 'efficientnet_b5': _cfg( - url='', input_size=(3, 456, 456), pool_size=(15, 15), crop_pct=0.934), - 'efficientnet_b6': _cfg( - url='', input_size=(3, 528, 528), pool_size=(17, 17), crop_pct=0.942), - 'efficientnet_b7': _cfg( - url='', input_size=(3, 600, 600), pool_size=(19, 19), crop_pct=0.949), - 'efficientnet_b8': _cfg( - url='', input_size=(3, 672, 672), pool_size=(21, 21), crop_pct=0.954), - 'efficientnet_l2': _cfg( - url='', input_size=(3, 800, 800), pool_size=(25, 25), crop_pct=0.961), - - 'efficientnet_es': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_es_ra-f111e99c.pth'), - 'efficientnet_em': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_em_ra2-66250f76.pth', - input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), - 'efficientnet_el': _cfg( - url='https://github.com/DeGirum/pruned-models/releases/download/efficientnet_v1.0/efficientnet_el.pth', - input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904), - - 'efficientnet_es_pruned': _cfg( - url='https://github.com/DeGirum/pruned-models/releases/download/efficientnet_v1.0/efficientnet_es_pruned75.pth'), - 'efficientnet_el_pruned': _cfg( - url='https://github.com/DeGirum/pruned-models/releases/download/efficientnet_v1.0/efficientnet_el_pruned70.pth', - input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904), - - 'efficientnet_cc_b0_4e': _cfg(url=''), - 'efficientnet_cc_b0_8e': _cfg(url=''), - 'efficientnet_cc_b1_8e': _cfg(url='', input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), - - 'efficientnet_lite0': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_lite0_ra-37913777.pth'), - 'efficientnet_lite1': _cfg( - url='', - 
input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), - 'efficientnet_lite2': _cfg( - url='', - input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890), - 'efficientnet_lite3': _cfg( - url='', - input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904), - 'efficientnet_lite4': _cfg( - url='', input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.922), - - 'efficientnet_b1_pruned': _cfg( - url='https://imvl-automl-sh.oss-cn-shanghai.aliyuncs.com/darts/hyperml/hyperml/job_45403/outputs/effnetb1_pruned_9ebb3fe6.pth', - input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882, mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), - 'efficientnet_b2_pruned': _cfg( - url='https://imvl-automl-sh.oss-cn-shanghai.aliyuncs.com/darts/hyperml/hyperml/job_45403/outputs/effnetb2_pruned_203f55bc.pth', - input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890, mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), - 'efficientnet_b3_pruned': _cfg( - url='https://imvl-automl-sh.oss-cn-shanghai.aliyuncs.com/darts/hyperml/hyperml/job_45403/outputs/effnetb3_pruned_5abcc29f.pth', - input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904, mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), - - 'efficientnet_v2s': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_v2s_ra2-b265c1ba.pth', - input_size=(3, 224, 224), test_size=(3, 288, 288), pool_size=(7, 7), crop_pct=1.0), # FIXME WIP - - 'tf_efficientnet_b0': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_aa-827b6e33.pth', - input_size=(3, 224, 224)), - 'tf_efficientnet_b1': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_aa-ea7a6ee0.pth', - input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), - 'tf_efficientnet_b2': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_aa-60c94f97.pth', - input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890), - 'tf_efficientnet_b3': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_aa-84b4657e.pth', - input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904), - 'tf_efficientnet_b4': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_aa-818f208c.pth', - input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.922), - 'tf_efficientnet_b5': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ra-9a3e5369.pth', - input_size=(3, 456, 456), pool_size=(15, 15), crop_pct=0.934), - 'tf_efficientnet_b6': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_aa-80ba17e4.pth', - input_size=(3, 528, 528), pool_size=(17, 17), crop_pct=0.942), - 'tf_efficientnet_b7': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ra-6c08e654.pth', - input_size=(3, 600, 600), pool_size=(19, 19), crop_pct=0.949), - 'tf_efficientnet_b8': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b8_ra-572d5dd9.pth', - input_size=(3, 672, 672), pool_size=(21, 21), crop_pct=0.954), - - 'tf_efficientnet_b0_ap': _cfg( - 
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_ap-f262efe1.pth', - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, input_size=(3, 224, 224)), - 'tf_efficientnet_b1_ap': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_ap-44ef0a3d.pth', - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, - input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), - 'tf_efficientnet_b2_ap': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_ap-2f8e7636.pth', - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, - input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890), - 'tf_efficientnet_b3_ap': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_ap-aad25bdd.pth', - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, - input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904), - 'tf_efficientnet_b4_ap': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_ap-dedb23e6.pth', - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, - input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.922), - 'tf_efficientnet_b5_ap': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ap-9e82fae8.pth', - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, - input_size=(3, 456, 456), pool_size=(15, 15), crop_pct=0.934), - 'tf_efficientnet_b6_ap': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_ap-4ffb161f.pth', - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, - input_size=(3, 528, 528), pool_size=(17, 17), crop_pct=0.942), - 'tf_efficientnet_b7_ap': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ap-ddb28fec.pth', - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, - input_size=(3, 600, 600), pool_size=(19, 19), crop_pct=0.949), - 'tf_efficientnet_b8_ap': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b8_ap-00e169fa.pth', - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, - input_size=(3, 672, 672), pool_size=(21, 21), crop_pct=0.954), - - 'tf_efficientnet_b0_ns': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_ns-c0e6a31c.pth', - input_size=(3, 224, 224)), - 'tf_efficientnet_b1_ns': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_ns-99dd0c41.pth', - input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), - 'tf_efficientnet_b2_ns': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_ns-00306e48.pth', - input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890), - 'tf_efficientnet_b3_ns': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_ns-9d44bf68.pth', - input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904), - 'tf_efficientnet_b4_ns': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_ns-d6313a46.pth', - input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.922), - 
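    # Sketch of how these cfgs drive eval preprocessing (assuming the vendored data helpers
    # keep upstream timm's floor(img_size / crop_pct) resize-then-center-crop convention):
    #   cfg = default_cfgs['tf_efficientnet_b4_ns']
    #   img_size, crop_pct = cfg['input_size'][-1], cfg['crop_pct']   # 380, 0.922
    #   resize_size = int(img_size / crop_pct)                        # resize to ~412px, then center-crop to 380px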
'tf_efficientnet_b5_ns': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ns-6f26d0cf.pth', - input_size=(3, 456, 456), pool_size=(15, 15), crop_pct=0.934), - 'tf_efficientnet_b6_ns': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_ns-51548356.pth', - input_size=(3, 528, 528), pool_size=(17, 17), crop_pct=0.942), - 'tf_efficientnet_b7_ns': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ns-1dbc32de.pth', - input_size=(3, 600, 600), pool_size=(19, 19), crop_pct=0.949), - 'tf_efficientnet_l2_ns_475': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_l2_ns_475-bebbd00a.pth', - input_size=(3, 475, 475), pool_size=(15, 15), crop_pct=0.936), - 'tf_efficientnet_l2_ns': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_l2_ns-df73bb44.pth', - input_size=(3, 800, 800), pool_size=(25, 25), crop_pct=0.96), - - 'tf_efficientnet_es': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_es-ca1afbfe.pth', - mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), - input_size=(3, 224, 224), ), - 'tf_efficientnet_em': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_em-e78cfe58.pth', - mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), - input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), - 'tf_efficientnet_el': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_el-5143854e.pth', - mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), - input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904), - - 'tf_efficientnet_cc_b0_4e': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b0_4e-4362b6b2.pth', - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), - 'tf_efficientnet_cc_b0_8e': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b0_8e-66184a25.pth', - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), - 'tf_efficientnet_cc_b1_8e': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b1_8e-f7c79ae1.pth', - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, - input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), - - 'tf_efficientnet_lite0': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite0-0aa007d2.pth', - mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), - interpolation='bicubic', # should be bilinear but bicubic better match for TF bilinear at low res - ), - 'tf_efficientnet_lite1': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite1-bde8b488.pth', - mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), - input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882, - interpolation='bicubic', # should be bilinear but bicubic better match for TF bilinear at low res - ), - 'tf_efficientnet_lite2': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite2-dcccb7df.pth', - mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), - input_size=(3, 260, 260), pool_size=(9, 9), 
crop_pct=0.890, - interpolation='bicubic', # should be bilinear but bicubic better match for TF bilinear at low res - ), - 'tf_efficientnet_lite3': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite3-b733e338.pth', - mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), - input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904, interpolation='bilinear'), - 'tf_efficientnet_lite4': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite4-741542c3.pth', - mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), - input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.920, interpolation='bilinear'), - - 'mixnet_s': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_s-a907afbc.pth'), - 'mixnet_m': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_m-4647fc68.pth'), - 'mixnet_l': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_l-5a9a2ed8.pth'), - 'mixnet_xl': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_xl_ra-aac3c00c.pth'), - 'mixnet_xxl': _cfg(), - - 'tf_mixnet_s': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_s-89d3354b.pth'), - 'tf_mixnet_m': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_m-0f4d8805.pth'), - 'tf_mixnet_l': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_l-6c92e0c8.pth'), -} - -_DEBUG = False - - -class EfficientNet(nn.Module): - """ (Generic) EfficientNet - - A flexible and performant PyTorch implementation of efficient network architectures, including: - * EfficientNet B0-B8, L2 - * EfficientNet-EdgeTPU - * EfficientNet-CondConv - * MixNet S, M, L, XL - * MnasNet A1, B1, and small - * FBNet C - * Single-Path NAS Pixel1 - - """ - - def __init__(self, block_args, num_classes=1000, num_features=1280, in_chans=3, stem_size=32, - channel_multiplier=1.0, channel_divisor=8, channel_min=None, - output_stride=32, pad_type='', fix_stem=False, act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0., - se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, global_pool='avg'): - super(EfficientNet, self).__init__() - norm_kwargs = norm_kwargs or {} - - self.num_classes = num_classes - self.num_features = num_features - self.drop_rate = drop_rate - - # Stem - if not fix_stem: - stem_size = round_channels(stem_size, channel_multiplier, channel_divisor, channel_min) - self.conv_stem = create_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type) - self.bn1 = norm_layer(stem_size, **norm_kwargs) - self.act1 = act_layer(inplace=True) - - # Middle stages (IR/ER/DS Blocks) - builder = EfficientNetBuilder( - channel_multiplier, channel_divisor, channel_min, output_stride, pad_type, act_layer, se_kwargs, - norm_layer, norm_kwargs, drop_path_rate, verbose=_DEBUG) - self.blocks = nn.Sequential(*builder(stem_size, block_args)) - self.feature_info = builder.features - head_chs = builder.in_chs - - # Head + Pooling - self.conv_head = create_conv2d(head_chs, self.num_features, 1, padding=pad_type) - self.bn2 = norm_layer(self.num_features, **norm_kwargs) - self.act2 = act_layer(inplace=True) - self.global_pool, self.classifier = create_classifier( - self.num_features, self.num_classes, pool_type=global_pool) - - 
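        # Head: conv_head is a 1x1 expansion to num_features (1280 by default), followed by
        # bn2 + act2; forward() below applies global pooling, optional dropout, then the classifier.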
efficientnet_init_weights(self) - - def as_sequential(self): - layers = [self.conv_stem, self.bn1, self.act1] - layers.extend(self.blocks) - layers.extend([self.conv_head, self.bn2, self.act2, self.global_pool]) - layers.extend([nn.Dropout(self.drop_rate), self.classifier]) - return nn.Sequential(*layers) - - def get_classifier(self): - return self.classifier - - def reset_classifier(self, num_classes, global_pool='avg'): - self.num_classes = num_classes - self.global_pool, self.classifier = create_classifier( - self.num_features, self.num_classes, pool_type=global_pool) - - def forward_features(self, x): - x = self.conv_stem(x) - x = self.bn1(x) - x = self.act1(x) - x = self.blocks(x) - x = self.conv_head(x) - x = self.bn2(x) - x = self.act2(x) - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.global_pool(x) - if self.drop_rate > 0.: - x = F.dropout(x, p=self.drop_rate, training=self.training) - return self.classifier(x) - - -class EfficientNetFeatures(nn.Module): - """ EfficientNet Feature Extractor - - A work-in-progress feature extraction module for EfficientNet, to use as a backbone for segmentation - and object detection models. - """ - - def __init__(self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='bottleneck', - in_chans=3, stem_size=32, channel_multiplier=1.0, channel_divisor=8, channel_min=None, - output_stride=32, pad_type='', fix_stem=False, act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0., - se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None): - super(EfficientNetFeatures, self).__init__() - norm_kwargs = norm_kwargs or {} - self.drop_rate = drop_rate - - # Stem - if not fix_stem: - stem_size = round_channels(stem_size, channel_multiplier, channel_divisor, channel_min) - self.conv_stem = create_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type) - self.bn1 = norm_layer(stem_size, **norm_kwargs) - self.act1 = act_layer(inplace=True) - - # Middle stages (IR/ER/DS Blocks) - builder = EfficientNetBuilder( - channel_multiplier, channel_divisor, channel_min, output_stride, pad_type, act_layer, se_kwargs, - norm_layer, norm_kwargs, drop_path_rate, feature_location=feature_location, verbose=_DEBUG) - self.blocks = nn.Sequential(*builder(stem_size, block_args)) - self.feature_info = FeatureInfo(builder.features, out_indices) - self._stage_out_idx = {v['stage']: i for i, v in enumerate(self.feature_info) if i in out_indices} - - efficientnet_init_weights(self) - - # Register feature extraction hooks with FeatureHooks helper - self.feature_hooks = None - if feature_location != 'bottleneck': - hooks = self.feature_info.get_dicts(keys=('module', 'hook_type')) - self.feature_hooks = FeatureHooks(hooks, self.named_modules()) - - def forward(self, x) -> List[torch.Tensor]: - x = self.conv_stem(x) - x = self.bn1(x) - x = self.act1(x) - if self.feature_hooks is None: - features = [] - if 0 in self._stage_out_idx: - features.append(x) # add stem out - for i, b in enumerate(self.blocks): - x = b(x) - if i + 1 in self._stage_out_idx: - features.append(x) - return features - else: - self.blocks(x) - out = self.feature_hooks.get_output(x.device) - return list(out.values()) - - -def _create_effnet(variant, pretrained=False, **kwargs): - features_only = False - model_cls = EfficientNet - kwargs_filter = None - if kwargs.pop('features_only', False): - features_only = True - kwargs_filter = ('num_classes', 'num_features', 'head_conv', 'global_pool') - model_cls = EfficientNetFeatures - model = build_model_with_cfg( - model_cls, variant, 
pretrained, - default_cfg=default_cfgs[variant], - pretrained_strict=not features_only, - kwargs_filter=kwargs_filter, - **kwargs) - if features_only: - model.default_cfg = default_cfg_for_features(model.default_cfg) - return model - - -def _gen_mnasnet_a1(variant, channel_multiplier=1.0, pretrained=False, **kwargs): - """Creates a mnasnet-a1 model. - - Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet - Paper: https://arxiv.org/pdf/1807.11626.pdf. - - Args: - channel_multiplier: multiplier to number of channels per layer. - """ - arch_def = [ - # stage 0, 112x112 in - ['ds_r1_k3_s1_e1_c16_noskip'], - # stage 1, 112x112 in - ['ir_r2_k3_s2_e6_c24'], - # stage 2, 56x56 in - ['ir_r3_k5_s2_e3_c40_se0.25'], - # stage 3, 28x28 in - ['ir_r4_k3_s2_e6_c80'], - # stage 4, 14x14in - ['ir_r2_k3_s1_e6_c112_se0.25'], - # stage 5, 14x14in - ['ir_r3_k5_s2_e6_c160_se0.25'], - # stage 6, 7x7 in - ['ir_r1_k3_s1_e6_c320'], - ] - model_kwargs = dict( - block_args=decode_arch_def(arch_def), - stem_size=32, - channel_multiplier=channel_multiplier, - norm_kwargs=resolve_bn_args(kwargs), - **kwargs - ) - model = _create_effnet(variant, pretrained, **model_kwargs) - return model - - -def _gen_mnasnet_b1(variant, channel_multiplier=1.0, pretrained=False, **kwargs): - """Creates a mnasnet-b1 model. - - Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet - Paper: https://arxiv.org/pdf/1807.11626.pdf. - - Args: - channel_multiplier: multiplier to number of channels per layer. - """ - arch_def = [ - # stage 0, 112x112 in - ['ds_r1_k3_s1_c16_noskip'], - # stage 1, 112x112 in - ['ir_r3_k3_s2_e3_c24'], - # stage 2, 56x56 in - ['ir_r3_k5_s2_e3_c40'], - # stage 3, 28x28 in - ['ir_r3_k5_s2_e6_c80'], - # stage 4, 14x14in - ['ir_r2_k3_s1_e6_c96'], - # stage 5, 14x14in - ['ir_r4_k5_s2_e6_c192'], - # stage 6, 7x7 in - ['ir_r1_k3_s1_e6_c320_noskip'] - ] - model_kwargs = dict( - block_args=decode_arch_def(arch_def), - stem_size=32, - channel_multiplier=channel_multiplier, - norm_kwargs=resolve_bn_args(kwargs), - **kwargs - ) - model = _create_effnet(variant, pretrained, **model_kwargs) - return model - - -def _gen_mnasnet_small(variant, channel_multiplier=1.0, pretrained=False, **kwargs): - """Creates a mnasnet-small model. - - Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet - Paper: https://arxiv.org/pdf/1807.11626.pdf. - - Args: - channel_multiplier: multiplier to number of channels per layer. 
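    The arch_def block strings used throughout these generators are parsed by
    decode_arch_def, e.g. 'ir_r3_k5_s2_e6_c160_se0.25' reads: inverted-residual
    block, repeated 3x, 5x5 kernel, stride 2, expansion ratio 6, 160 output
    channels, squeeze-excite ratio 0.25 ('ds' = depthwise-separable,
    'er' = edge-residual, '_noskip' disables the shortcut).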
- """ - arch_def = [ - ['ds_r1_k3_s1_c8'], - ['ir_r1_k3_s2_e3_c16'], - ['ir_r2_k3_s2_e6_c16'], - ['ir_r4_k5_s2_e6_c32_se0.25'], - ['ir_r3_k3_s1_e6_c32_se0.25'], - ['ir_r3_k5_s2_e6_c88_se0.25'], - ['ir_r1_k3_s1_e6_c144'] - ] - model_kwargs = dict( - block_args=decode_arch_def(arch_def), - stem_size=8, - channel_multiplier=channel_multiplier, - norm_kwargs=resolve_bn_args(kwargs), - **kwargs - ) - model = _create_effnet(variant, pretrained, **model_kwargs) - return model - - -def _gen_mobilenet_v2( - variant, channel_multiplier=1.0, depth_multiplier=1.0, fix_stem_head=False, pretrained=False, **kwargs): - """ Generate MobileNet-V2 network - Ref impl: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet_v2.py - Paper: https://arxiv.org/abs/1801.04381 - """ - arch_def = [ - ['ds_r1_k3_s1_c16'], - ['ir_r2_k3_s2_e6_c24'], - ['ir_r3_k3_s2_e6_c32'], - ['ir_r4_k3_s2_e6_c64'], - ['ir_r3_k3_s1_e6_c96'], - ['ir_r3_k3_s2_e6_c160'], - ['ir_r1_k3_s1_e6_c320'], - ] - model_kwargs = dict( - block_args=decode_arch_def(arch_def, depth_multiplier=depth_multiplier, fix_first_last=fix_stem_head), - num_features=1280 if fix_stem_head else round_channels(1280, channel_multiplier, 8, None), - stem_size=32, - fix_stem=fix_stem_head, - channel_multiplier=channel_multiplier, - norm_kwargs=resolve_bn_args(kwargs), - act_layer=resolve_act_layer(kwargs, 'relu6'), - **kwargs - ) - model = _create_effnet(variant, pretrained, **model_kwargs) - return model - - -def _gen_fbnetc(variant, channel_multiplier=1.0, pretrained=False, **kwargs): - """ FBNet-C - - Paper: https://arxiv.org/abs/1812.03443 - Ref Impl: https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/backbone/fbnet_modeldef.py - - NOTE: the impl above does not relate to the 'C' variant here, that was derived from paper, - it was used to confirm some building block details - """ - arch_def = [ - ['ir_r1_k3_s1_e1_c16'], - ['ir_r1_k3_s2_e6_c24', 'ir_r2_k3_s1_e1_c24'], - ['ir_r1_k5_s2_e6_c32', 'ir_r1_k5_s1_e3_c32', 'ir_r1_k5_s1_e6_c32', 'ir_r1_k3_s1_e6_c32'], - ['ir_r1_k5_s2_e6_c64', 'ir_r1_k5_s1_e3_c64', 'ir_r2_k5_s1_e6_c64'], - ['ir_r3_k5_s1_e6_c112', 'ir_r1_k5_s1_e3_c112'], - ['ir_r4_k5_s2_e6_c184'], - ['ir_r1_k3_s1_e6_c352'], - ] - model_kwargs = dict( - block_args=decode_arch_def(arch_def), - stem_size=16, - num_features=1984, # paper suggests this, but is not 100% clear - channel_multiplier=channel_multiplier, - norm_kwargs=resolve_bn_args(kwargs), - **kwargs - ) - model = _create_effnet(variant, pretrained, **model_kwargs) - return model - - -def _gen_spnasnet(variant, channel_multiplier=1.0, pretrained=False, **kwargs): - """Creates the Single-Path NAS model from search targeted for Pixel1 phone. - - Paper: https://arxiv.org/abs/1904.02877 - - Args: - channel_multiplier: multiplier to number of channels per layer. 
- """ - arch_def = [ - # stage 0, 112x112 in - ['ds_r1_k3_s1_c16_noskip'], - # stage 1, 112x112 in - ['ir_r3_k3_s2_e3_c24'], - # stage 2, 56x56 in - ['ir_r1_k5_s2_e6_c40', 'ir_r3_k3_s1_e3_c40'], - # stage 3, 28x28 in - ['ir_r1_k5_s2_e6_c80', 'ir_r3_k3_s1_e3_c80'], - # stage 4, 14x14in - ['ir_r1_k5_s1_e6_c96', 'ir_r3_k5_s1_e3_c96'], - # stage 5, 14x14in - ['ir_r4_k5_s2_e6_c192'], - # stage 6, 7x7 in - ['ir_r1_k3_s1_e6_c320_noskip'] - ] - model_kwargs = dict( - block_args=decode_arch_def(arch_def), - stem_size=32, - channel_multiplier=channel_multiplier, - norm_kwargs=resolve_bn_args(kwargs), - **kwargs - ) - model = _create_effnet(variant, pretrained, **model_kwargs) - return model - - -def _gen_efficientnet(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): - """Creates an EfficientNet model. - - Ref impl: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.py - Paper: https://arxiv.org/abs/1905.11946 - - EfficientNet params - name: (channel_multiplier, depth_multiplier, resolution, dropout_rate) - 'efficientnet-b0': (1.0, 1.0, 224, 0.2), - 'efficientnet-b1': (1.0, 1.1, 240, 0.2), - 'efficientnet-b2': (1.1, 1.2, 260, 0.3), - 'efficientnet-b3': (1.2, 1.4, 300, 0.3), - 'efficientnet-b4': (1.4, 1.8, 380, 0.4), - 'efficientnet-b5': (1.6, 2.2, 456, 0.4), - 'efficientnet-b6': (1.8, 2.6, 528, 0.5), - 'efficientnet-b7': (2.0, 3.1, 600, 0.5), - 'efficientnet-b8': (2.2, 3.6, 672, 0.5), - 'efficientnet-l2': (4.3, 5.3, 800, 0.5), - - Args: - channel_multiplier: multiplier to number of channels per layer - depth_multiplier: multiplier to number of repeats per stage - - """ - arch_def = [ - ['ds_r1_k3_s1_e1_c16_se0.25'], - ['ir_r2_k3_s2_e6_c24_se0.25'], - ['ir_r2_k5_s2_e6_c40_se0.25'], - ['ir_r3_k3_s2_e6_c80_se0.25'], - ['ir_r3_k5_s1_e6_c112_se0.25'], - ['ir_r4_k5_s2_e6_c192_se0.25'], - ['ir_r1_k3_s1_e6_c320_se0.25'], - ] - model_kwargs = dict( - block_args=decode_arch_def(arch_def, depth_multiplier), - num_features=round_channels(1280, channel_multiplier, 8, None), - stem_size=32, - channel_multiplier=channel_multiplier, - act_layer=resolve_act_layer(kwargs, 'swish'), - norm_kwargs=resolve_bn_args(kwargs), - **kwargs, - ) - model = _create_effnet(variant, pretrained, **model_kwargs) - return model - - -def _gen_efficientnet_edge(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): - """ Creates an EfficientNet-EdgeTPU model - - Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/edgetpu - """ - - arch_def = [ - # NOTE `fc` is present to override a mismatch between stem channels and in chs not - # present in other models - ['er_r1_k3_s1_e4_c24_fc24_noskip'], - ['er_r2_k3_s2_e8_c32'], - ['er_r4_k3_s2_e8_c48'], - ['ir_r5_k5_s2_e8_c96'], - ['ir_r4_k5_s1_e8_c144'], - ['ir_r2_k5_s2_e8_c192'], - ] - model_kwargs = dict( - block_args=decode_arch_def(arch_def, depth_multiplier), - num_features=round_channels(1280, channel_multiplier, 8, None), - stem_size=32, - channel_multiplier=channel_multiplier, - norm_kwargs=resolve_bn_args(kwargs), - act_layer=resolve_act_layer(kwargs, 'relu'), - **kwargs, - ) - model = _create_effnet(variant, pretrained, **model_kwargs) - return model - - -def _gen_efficientnet_condconv( - variant, channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=1, pretrained=False, **kwargs): - """Creates an EfficientNet-CondConv model. 
- - Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/condconv - """ - arch_def = [ - ['ds_r1_k3_s1_e1_c16_se0.25'], - ['ir_r2_k3_s2_e6_c24_se0.25'], - ['ir_r2_k5_s2_e6_c40_se0.25'], - ['ir_r3_k3_s2_e6_c80_se0.25'], - ['ir_r3_k5_s1_e6_c112_se0.25_cc4'], - ['ir_r4_k5_s2_e6_c192_se0.25_cc4'], - ['ir_r1_k3_s1_e6_c320_se0.25_cc4'], - ] - # NOTE unlike official impl, this one uses `cc` option where x is the base number of experts for each stage and - # the expert_multiplier increases that on a per-model basis as with depth/channel multipliers - model_kwargs = dict( - block_args=decode_arch_def(arch_def, depth_multiplier, experts_multiplier=experts_multiplier), - num_features=round_channels(1280, channel_multiplier, 8, None), - stem_size=32, - channel_multiplier=channel_multiplier, - norm_kwargs=resolve_bn_args(kwargs), - act_layer=resolve_act_layer(kwargs, 'swish'), - **kwargs, - ) - model = _create_effnet(variant, pretrained, **model_kwargs) - return model - - -def _gen_efficientnet_lite(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): - """Creates an EfficientNet-Lite model. - - Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/lite - Paper: https://arxiv.org/abs/1905.11946 - - EfficientNet params - name: (channel_multiplier, depth_multiplier, resolution, dropout_rate) - 'efficientnet-lite0': (1.0, 1.0, 224, 0.2), - 'efficientnet-lite1': (1.0, 1.1, 240, 0.2), - 'efficientnet-lite2': (1.1, 1.2, 260, 0.3), - 'efficientnet-lite3': (1.2, 1.4, 280, 0.3), - 'efficientnet-lite4': (1.4, 1.8, 300, 0.3), - - Args: - channel_multiplier: multiplier to number of channels per layer - depth_multiplier: multiplier to number of repeats per stage - """ - arch_def = [ - ['ds_r1_k3_s1_e1_c16'], - ['ir_r2_k3_s2_e6_c24'], - ['ir_r2_k5_s2_e6_c40'], - ['ir_r3_k3_s2_e6_c80'], - ['ir_r3_k5_s1_e6_c112'], - ['ir_r4_k5_s2_e6_c192'], - ['ir_r1_k3_s1_e6_c320'], - ] - model_kwargs = dict( - block_args=decode_arch_def(arch_def, depth_multiplier, fix_first_last=True), - num_features=1280, - stem_size=32, - fix_stem=True, - channel_multiplier=channel_multiplier, - act_layer=resolve_act_layer(kwargs, 'relu6'), - norm_kwargs=resolve_bn_args(kwargs), - **kwargs, - ) - model = _create_effnet(variant, pretrained, **model_kwargs) - return model - - -def _gen_efficientnet_v2s(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): - """ Creates an EfficientNet-V2s model - - NOTE: this is a preliminary definition based on paper, awaiting official code release for details - and weights - - Ref impl: - Paper: `EfficientNetV2: Smaller Models and Faster Training` - https://arxiv.org/abs/2104.00298 - """ - - arch_def = [ - # FIXME it's not clear if the FusedMBConv layers have SE enabled for the Small variant, - # Table 4 suggests no. 23.94M params w/o, 23.98 with which is closer to 24M. 
- # ['er_r2_k3_s1_e1_c24_se0.25'], - # ['er_r4_k3_s2_e4_c48_se0.25'], - # ['er_r4_k3_s2_e4_c64_se0.25'], - ['er_r2_k3_s1_e1_c24'], - ['er_r4_k3_s2_e4_c48'], - ['er_r4_k3_s2_e4_c64'], - ['ir_r6_k3_s2_e4_c128_se0.25'], - ['ir_r9_k3_s1_e6_c160_se0.25'], - ['ir_r15_k3_s2_e6_c272_se0.25'], - ] - model_kwargs = dict( - block_args=decode_arch_def(arch_def, depth_multiplier), - num_features=round_channels(1792, channel_multiplier, 8, None), - stem_size=24, - channel_multiplier=channel_multiplier, - norm_kwargs=resolve_bn_args(kwargs), - act_layer=resolve_act_layer(kwargs, 'silu'), # FIXME this is an assumption, paper does not mention - **kwargs, - ) - model = _create_effnet(variant, pretrained, **model_kwargs) - return model - - -def _gen_mixnet_s(variant, channel_multiplier=1.0, pretrained=False, **kwargs): - """Creates a MixNet Small model. - - Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet - Paper: https://arxiv.org/abs/1907.09595 - """ - arch_def = [ - # stage 0, 112x112 in - ['ds_r1_k3_s1_e1_c16'], # relu - # stage 1, 112x112 in - ['ir_r1_k3_a1.1_p1.1_s2_e6_c24', 'ir_r1_k3_a1.1_p1.1_s1_e3_c24'], # relu - # stage 2, 56x56 in - ['ir_r1_k3.5.7_s2_e6_c40_se0.5_nsw', 'ir_r3_k3.5_a1.1_p1.1_s1_e6_c40_se0.5_nsw'], # swish - # stage 3, 28x28 in - ['ir_r1_k3.5.7_p1.1_s2_e6_c80_se0.25_nsw', 'ir_r2_k3.5_p1.1_s1_e6_c80_se0.25_nsw'], # swish - # stage 4, 14x14in - ['ir_r1_k3.5.7_a1.1_p1.1_s1_e6_c120_se0.5_nsw', 'ir_r2_k3.5.7.9_a1.1_p1.1_s1_e3_c120_se0.5_nsw'], # swish - # stage 5, 14x14in - ['ir_r1_k3.5.7.9.11_s2_e6_c200_se0.5_nsw', 'ir_r2_k3.5.7.9_p1.1_s1_e6_c200_se0.5_nsw'], # swish - # 7x7 - ] - model_kwargs = dict( - block_args=decode_arch_def(arch_def), - num_features=1536, - stem_size=16, - channel_multiplier=channel_multiplier, - norm_kwargs=resolve_bn_args(kwargs), - **kwargs - ) - model = _create_effnet(variant, pretrained, **model_kwargs) - return model - - -def _gen_mixnet_m(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): - """Creates a MixNet Medium-Large model. - - Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet - Paper: https://arxiv.org/abs/1907.09595 - """ - arch_def = [ - # stage 0, 112x112 in - ['ds_r1_k3_s1_e1_c24'], # relu - # stage 1, 112x112 in - ['ir_r1_k3.5.7_a1.1_p1.1_s2_e6_c32', 'ir_r1_k3_a1.1_p1.1_s1_e3_c32'], # relu - # stage 2, 56x56 in - ['ir_r1_k3.5.7.9_s2_e6_c40_se0.5_nsw', 'ir_r3_k3.5_a1.1_p1.1_s1_e6_c40_se0.5_nsw'], # swish - # stage 3, 28x28 in - ['ir_r1_k3.5.7_s2_e6_c80_se0.25_nsw', 'ir_r3_k3.5.7.9_a1.1_p1.1_s1_e6_c80_se0.25_nsw'], # swish - # stage 4, 14x14in - ['ir_r1_k3_s1_e6_c120_se0.5_nsw', 'ir_r3_k3.5.7.9_a1.1_p1.1_s1_e3_c120_se0.5_nsw'], # swish - # stage 5, 14x14in - ['ir_r1_k3.5.7.9_s2_e6_c200_se0.5_nsw', 'ir_r3_k3.5.7.9_p1.1_s1_e6_c200_se0.5_nsw'], # swish - # 7x7 - ] - model_kwargs = dict( - block_args=decode_arch_def(arch_def, depth_multiplier, depth_trunc='round'), - num_features=1536, - stem_size=24, - channel_multiplier=channel_multiplier, - norm_kwargs=resolve_bn_args(kwargs), - **kwargs - ) - model = _create_effnet(variant, pretrained, **model_kwargs) - return model - - -@register_model -def mnasnet_050(pretrained=False, **kwargs): - """ MNASNet B1, depth multiplier of 0.5. """ - model = _gen_mnasnet_b1('mnasnet_050', 0.5, pretrained=pretrained, **kwargs) - return model - - -@register_model -def mnasnet_075(pretrained=False, **kwargs): - """ MNASNet B1, depth multiplier of 0.75. 
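    Example (sketch): model = mnasnet_075(pretrained=False) builds the 0.75x variant
    for (N, 3, 224, 224) inputs.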
""" - model = _gen_mnasnet_b1('mnasnet_075', 0.75, pretrained=pretrained, **kwargs) - return model - - -@register_model -def mnasnet_100(pretrained=False, **kwargs): - """ MNASNet B1, depth multiplier of 1.0. """ - model = _gen_mnasnet_b1('mnasnet_100', 1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def mnasnet_b1(pretrained=False, **kwargs): - """ MNASNet B1, depth multiplier of 1.0. """ - return mnasnet_100(pretrained, **kwargs) - - -@register_model -def mnasnet_140(pretrained=False, **kwargs): - """ MNASNet B1, depth multiplier of 1.4 """ - model = _gen_mnasnet_b1('mnasnet_140', 1.4, pretrained=pretrained, **kwargs) - return model - - -@register_model -def semnasnet_050(pretrained=False, **kwargs): - """ MNASNet A1 (w/ SE), depth multiplier of 0.5 """ - model = _gen_mnasnet_a1('semnasnet_050', 0.5, pretrained=pretrained, **kwargs) - return model - - -@register_model -def semnasnet_075(pretrained=False, **kwargs): - """ MNASNet A1 (w/ SE), depth multiplier of 0.75. """ - model = _gen_mnasnet_a1('semnasnet_075', 0.75, pretrained=pretrained, **kwargs) - return model - - -@register_model -def semnasnet_100(pretrained=False, **kwargs): - """ MNASNet A1 (w/ SE), depth multiplier of 1.0. """ - model = _gen_mnasnet_a1('semnasnet_100', 1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def mnasnet_a1(pretrained=False, **kwargs): - """ MNASNet A1 (w/ SE), depth multiplier of 1.0. """ - return semnasnet_100(pretrained, **kwargs) - - -@register_model -def semnasnet_140(pretrained=False, **kwargs): - """ MNASNet A1 (w/ SE), depth multiplier of 1.4. """ - model = _gen_mnasnet_a1('semnasnet_140', 1.4, pretrained=pretrained, **kwargs) - return model - - -@register_model -def mnasnet_small(pretrained=False, **kwargs): - """ MNASNet Small, depth multiplier of 1.0. 
""" - model = _gen_mnasnet_small('mnasnet_small', 1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def mobilenetv2_100(pretrained=False, **kwargs): - """ MobileNet V2 w/ 1.0 channel multiplier """ - model = _gen_mobilenet_v2('mobilenetv2_100', 1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def mobilenetv2_140(pretrained=False, **kwargs): - """ MobileNet V2 w/ 1.4 channel multiplier """ - model = _gen_mobilenet_v2('mobilenetv2_140', 1.4, pretrained=pretrained, **kwargs) - return model - - -@register_model -def mobilenetv2_110d(pretrained=False, **kwargs): - """ MobileNet V2 w/ 1.1 channel, 1.2 depth multipliers""" - model = _gen_mobilenet_v2( - 'mobilenetv2_110d', 1.1, depth_multiplier=1.2, fix_stem_head=True, pretrained=pretrained, **kwargs) - return model - - -@register_model -def mobilenetv2_120d(pretrained=False, **kwargs): - """ MobileNet V2 w/ 1.2 channel, 1.4 depth multipliers """ - model = _gen_mobilenet_v2( - 'mobilenetv2_120d', 1.2, depth_multiplier=1.4, fix_stem_head=True, pretrained=pretrained, **kwargs) - return model - - -@register_model -def fbnetc_100(pretrained=False, **kwargs): - """ FBNet-C """ - if pretrained: - # pretrained model trained with non-default BN epsilon - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - model = _gen_fbnetc('fbnetc_100', 1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def spnasnet_100(pretrained=False, **kwargs): - """ Single-Path NAS Pixel1""" - model = _gen_spnasnet('spnasnet_100', 1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_b0(pretrained=False, **kwargs): - """ EfficientNet-B0 """ - # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 - model = _gen_efficientnet( - 'efficientnet_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_b1(pretrained=False, **kwargs): - """ EfficientNet-B1 """ - # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 - model = _gen_efficientnet( - 'efficientnet_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_b2(pretrained=False, **kwargs): - """ EfficientNet-B2 """ - # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 - model = _gen_efficientnet( - 'efficientnet_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_b2a(pretrained=False, **kwargs): - """ EfficientNet-B2 @ 288x288 w/ 1.0 test crop""" - # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 - model = _gen_efficientnet( - 'efficientnet_b2a', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_b3(pretrained=False, **kwargs): - """ EfficientNet-B3 """ - # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 - model = _gen_efficientnet( - 'efficientnet_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_b3a(pretrained=False, **kwargs): - """ EfficientNet-B3 @ 320x320 w/ 1.0 test crop-pct """ - # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 - model = _gen_efficientnet( - 'efficientnet_b3a', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) - return model - - -@register_model 
-def efficientnet_b4(pretrained=False, **kwargs): - """ EfficientNet-B4 """ - # NOTE for train, drop_rate should be 0.4, drop_path_rate should be 0.2 - model = _gen_efficientnet( - 'efficientnet_b4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_b5(pretrained=False, **kwargs): - """ EfficientNet-B5 """ - # NOTE for train, drop_rate should be 0.4, drop_path_rate should be 0.2 - model = _gen_efficientnet( - 'efficientnet_b5', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_b6(pretrained=False, **kwargs): - """ EfficientNet-B6 """ - # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2 - model = _gen_efficientnet( - 'efficientnet_b6', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_b7(pretrained=False, **kwargs): - """ EfficientNet-B7 """ - # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2 - model = _gen_efficientnet( - 'efficientnet_b7', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_b8(pretrained=False, **kwargs): - """ EfficientNet-B8 """ - # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2 - model = _gen_efficientnet( - 'efficientnet_b8', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_l2(pretrained=False, **kwargs): - """ EfficientNet-L2.""" - # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2 - model = _gen_efficientnet( - 'efficientnet_l2', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_es(pretrained=False, **kwargs): - """ EfficientNet-Edge Small. """ - model = _gen_efficientnet_edge( - 'efficientnet_es', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) - return model - -@register_model -def efficientnet_es_pruned(pretrained=False, **kwargs): - """ EfficientNet-Edge Small Pruned. For more info: https://github.com/DeGirum/pruned-models/releases/tag/efficientnet_v1.0""" - model = _gen_efficientnet_edge( - 'efficientnet_es_pruned', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) - return model - -@register_model -def efficientnet_em(pretrained=False, **kwargs): - """ EfficientNet-Edge-Medium. """ - model = _gen_efficientnet_edge( - 'efficientnet_em', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_el(pretrained=False, **kwargs): - """ EfficientNet-Edge-Large. """ - model = _gen_efficientnet_edge( - 'efficientnet_el', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) - return model - -@register_model -def efficientnet_el_pruned(pretrained=False, **kwargs): - """ EfficientNet-Edge-Large pruned. 
For more info: https://github.com/DeGirum/pruned-models/releases/tag/efficientnet_v1.0""" - model = _gen_efficientnet_edge( - 'efficientnet_el_pruned', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) - return model - -@register_model -def efficientnet_cc_b0_4e(pretrained=False, **kwargs): - """ EfficientNet-CondConv-B0 w/ 4 Experts """ - # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 - model = _gen_efficientnet_condconv( - 'efficientnet_cc_b0_4e', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_cc_b0_8e(pretrained=False, **kwargs): - """ EfficientNet-CondConv-B0 w/ 8 Experts """ - # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 - model = _gen_efficientnet_condconv( - 'efficientnet_cc_b0_8e', channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=2, - pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_cc_b1_8e(pretrained=False, **kwargs): - """ EfficientNet-CondConv-B1 w/ 8 Experts """ - # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 - model = _gen_efficientnet_condconv( - 'efficientnet_cc_b1_8e', channel_multiplier=1.0, depth_multiplier=1.1, experts_multiplier=2, - pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_lite0(pretrained=False, **kwargs): - """ EfficientNet-Lite0 """ - # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 - model = _gen_efficientnet_lite( - 'efficientnet_lite0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_lite1(pretrained=False, **kwargs): - """ EfficientNet-Lite1 """ - # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 - model = _gen_efficientnet_lite( - 'efficientnet_lite1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_lite2(pretrained=False, **kwargs): - """ EfficientNet-Lite2 """ - # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 - model = _gen_efficientnet_lite( - 'efficientnet_lite2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_lite3(pretrained=False, **kwargs): - """ EfficientNet-Lite3 """ - # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 - model = _gen_efficientnet_lite( - 'efficientnet_lite3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_lite4(pretrained=False, **kwargs): - """ EfficientNet-Lite4 """ - # NOTE for train, drop_rate should be 0.4, drop_path_rate should be 0.2 - model = _gen_efficientnet_lite( - 'efficientnet_lite4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_b1_pruned(pretrained=False, **kwargs): - """ EfficientNet-B1 Pruned. 
The pruning has been obtained using https://arxiv.org/pdf/2002.08258.pdf """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - variant = 'efficientnet_b1_pruned' - model = _gen_efficientnet( - variant, channel_multiplier=1.0, depth_multiplier=1.1, pruned=True, pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_b2_pruned(pretrained=False, **kwargs): - """ EfficientNet-B2 Pruned. The pruning has been obtained using https://arxiv.org/pdf/2002.08258.pdf """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'efficientnet_b2_pruned', channel_multiplier=1.1, depth_multiplier=1.2, pruned=True, - pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_b3_pruned(pretrained=False, **kwargs): - """ EfficientNet-B3 Pruned. The pruning has been obtained using https://arxiv.org/pdf/2002.08258.pdf """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'efficientnet_b3_pruned', channel_multiplier=1.2, depth_multiplier=1.4, pruned=True, - pretrained=pretrained, **kwargs) - return model - - -@register_model -def efficientnet_v2s(pretrained=False, **kwargs): - """ EfficientNet-V2 Small. """ - model = _gen_efficientnet_v2s( - 'efficientnet_v2s', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) - return model - - - -@register_model -def tf_efficientnet_b0(pretrained=False, **kwargs): - """ EfficientNet-B0. Tensorflow compatible variant """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b1(pretrained=False, **kwargs): - """ EfficientNet-B1. Tensorflow compatible variant """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b2(pretrained=False, **kwargs): - """ EfficientNet-B2. Tensorflow compatible variant """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b3(pretrained=False, **kwargs): - """ EfficientNet-B3. Tensorflow compatible variant """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b4(pretrained=False, **kwargs): - """ EfficientNet-B4. Tensorflow compatible variant """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b5(pretrained=False, **kwargs): - """ EfficientNet-B5. 
Tensorflow compatible variant """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b5', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b6(pretrained=False, **kwargs): - """ EfficientNet-B6. Tensorflow compatible variant """ - # NOTE for train, drop_rate should be 0.5 - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b6', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b7(pretrained=False, **kwargs): - """ EfficientNet-B7. Tensorflow compatible variant """ - # NOTE for train, drop_rate should be 0.5 - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b7', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b8(pretrained=False, **kwargs): - """ EfficientNet-B8. Tensorflow compatible variant """ - # NOTE for train, drop_rate should be 0.5 - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b8', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b0_ap(pretrained=False, **kwargs): - """ EfficientNet-B0 AdvProp. Tensorflow compatible variant """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b0_ap', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b1_ap(pretrained=False, **kwargs): - """ EfficientNet-B1 AdvProp. Tensorflow compatible variant """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b1_ap', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b2_ap(pretrained=False, **kwargs): - """ EfficientNet-B2 AdvProp. Tensorflow compatible variant """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b2_ap', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b3_ap(pretrained=False, **kwargs): - """ EfficientNet-B3 AdvProp. Tensorflow compatible variant """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b3_ap', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b4_ap(pretrained=False, **kwargs): - """ EfficientNet-B4 AdvProp. Tensorflow compatible variant """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b4_ap', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b5_ap(pretrained=False, **kwargs): - """ EfficientNet-B5 AdvProp. 
Tensorflow compatible variant """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b5_ap', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b6_ap(pretrained=False, **kwargs): - """ EfficientNet-B6 AdvProp. Tensorflow compatible variant """ - # NOTE for train, drop_rate should be 0.5 - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b6_ap', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b7_ap(pretrained=False, **kwargs): - """ EfficientNet-B7 AdvProp. Tensorflow compatible variant """ - # NOTE for train, drop_rate should be 0.5 - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b7_ap', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b8_ap(pretrained=False, **kwargs): - """ EfficientNet-B8 AdvProp. Tensorflow compatible variant """ - # NOTE for train, drop_rate should be 0.5 - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b8_ap', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b0_ns(pretrained=False, **kwargs): - """ EfficientNet-B0 NoisyStudent. Tensorflow compatible variant """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b0_ns', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b1_ns(pretrained=False, **kwargs): - """ EfficientNet-B1 NoisyStudent. Tensorflow compatible variant """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b1_ns', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b2_ns(pretrained=False, **kwargs): - """ EfficientNet-B2 NoisyStudent. Tensorflow compatible variant """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b2_ns', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b3_ns(pretrained=False, **kwargs): - """ EfficientNet-B3 NoisyStudent. Tensorflow compatible variant """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b3_ns', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b4_ns(pretrained=False, **kwargs): - """ EfficientNet-B4 NoisyStudent. Tensorflow compatible variant """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b4_ns', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b5_ns(pretrained=False, **kwargs): - """ EfficientNet-B5 NoisyStudent. 
Tensorflow compatible variant """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b5_ns', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b6_ns(pretrained=False, **kwargs): - """ EfficientNet-B6 NoisyStudent. Tensorflow compatible variant """ - # NOTE for train, drop_rate should be 0.5 - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b6_ns', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_b7_ns(pretrained=False, **kwargs): - """ EfficientNet-B7 NoisyStudent. Tensorflow compatible variant """ - # NOTE for train, drop_rate should be 0.5 - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_b7_ns', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_l2_ns_475(pretrained=False, **kwargs): - """ EfficientNet-L2 NoisyStudent @ 475x475. Tensorflow compatible variant """ - # NOTE for train, drop_rate should be 0.5 - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_l2_ns_475', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_l2_ns(pretrained=False, **kwargs): - """ EfficientNet-L2 NoisyStudent. Tensorflow compatible variant """ - # NOTE for train, drop_rate should be 0.5 - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet( - 'tf_efficientnet_l2_ns', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_es(pretrained=False, **kwargs): - """ EfficientNet-Edge Small. Tensorflow compatible variant """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet_edge( - 'tf_efficientnet_es', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_em(pretrained=False, **kwargs): - """ EfficientNet-Edge-Medium. Tensorflow compatible variant """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet_edge( - 'tf_efficientnet_em', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_el(pretrained=False, **kwargs): - """ EfficientNet-Edge-Large. Tensorflow compatible variant """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet_edge( - 'tf_efficientnet_el', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_cc_b0_4e(pretrained=False, **kwargs): - """ EfficientNet-CondConv-B0 w/ 4 Experts. 
Tensorflow compatible variant """ - # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet_condconv( - 'tf_efficientnet_cc_b0_4e', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_cc_b0_8e(pretrained=False, **kwargs): - """ EfficientNet-CondConv-B0 w/ 8 Experts. Tensorflow compatible variant """ - # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet_condconv( - 'tf_efficientnet_cc_b0_8e', channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=2, - pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_cc_b1_8e(pretrained=False, **kwargs): - """ EfficientNet-CondConv-B1 w/ 8 Experts. Tensorflow compatible variant """ - # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet_condconv( - 'tf_efficientnet_cc_b1_8e', channel_multiplier=1.0, depth_multiplier=1.1, experts_multiplier=2, - pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_lite0(pretrained=False, **kwargs): - """ EfficientNet-Lite0 """ - # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet_lite( - 'tf_efficientnet_lite0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_lite1(pretrained=False, **kwargs): - """ EfficientNet-Lite1 """ - # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet_lite( - 'tf_efficientnet_lite1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_lite2(pretrained=False, **kwargs): - """ EfficientNet-Lite2 """ - # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet_lite( - 'tf_efficientnet_lite2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_lite3(pretrained=False, **kwargs): - """ EfficientNet-Lite3 """ - # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet_lite( - 'tf_efficientnet_lite3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_efficientnet_lite4(pretrained=False, **kwargs): - """ EfficientNet-Lite4 """ - # NOTE for train, drop_rate should be 0.4, drop_path_rate should be 0.2 - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_efficientnet_lite( - 'tf_efficientnet_lite4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) - return model - - -@register_model -def mixnet_s(pretrained=False, **kwargs): - """Creates a MixNet Small model. 
- """ - model = _gen_mixnet_s( - 'mixnet_s', channel_multiplier=1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def mixnet_m(pretrained=False, **kwargs): - """Creates a MixNet Medium model. - """ - model = _gen_mixnet_m( - 'mixnet_m', channel_multiplier=1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def mixnet_l(pretrained=False, **kwargs): - """Creates a MixNet Large model. - """ - model = _gen_mixnet_m( - 'mixnet_l', channel_multiplier=1.3, pretrained=pretrained, **kwargs) - return model - - -@register_model -def mixnet_xl(pretrained=False, **kwargs): - """Creates a MixNet Extra-Large model. - Not a paper spec, experimental def by RW w/ depth scaling. - """ - model = _gen_mixnet_m( - 'mixnet_xl', channel_multiplier=1.6, depth_multiplier=1.2, pretrained=pretrained, **kwargs) - return model - - -@register_model -def mixnet_xxl(pretrained=False, **kwargs): - """Creates a MixNet Double Extra Large model. - Not a paper spec, experimental def by RW w/ depth scaling. - """ - model = _gen_mixnet_m( - 'mixnet_xxl', channel_multiplier=2.4, depth_multiplier=1.3, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_mixnet_s(pretrained=False, **kwargs): - """Creates a MixNet Small model. Tensorflow compatible variant - """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_mixnet_s( - 'tf_mixnet_s', channel_multiplier=1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_mixnet_m(pretrained=False, **kwargs): - """Creates a MixNet Medium model. Tensorflow compatible variant - """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_mixnet_m( - 'tf_mixnet_m', channel_multiplier=1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_mixnet_l(pretrained=False, **kwargs): - """Creates a MixNet Large model. Tensorflow compatible variant - """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_mixnet_m( - 'tf_mixnet_l', channel_multiplier=1.3, pretrained=pretrained, **kwargs) - return model diff --git a/AVLFormer/src/timm/models/efficientnet_blocks.py b/AVLFormer/src/timm/models/efficientnet_blocks.py deleted file mode 100644 index 114533c..0000000 --- a/AVLFormer/src/timm/models/efficientnet_blocks.py +++ /dev/null @@ -1,413 +0,0 @@ -""" EfficientNet, MobileNetV3, etc Blocks - -Hacked together by / Copyright 2020 Ross Wightman -""" - -import torch -import torch.nn as nn -from torch.nn import functional as F - -from .layers import create_conv2d, drop_path, get_act_layer -from .layers.activations import sigmoid - -# Defaults used for Google/Tensorflow training of mobile networks /w RMSprop as per -# papers and TF reference implementations. 
PT momentum equiv for TF decay is (1 - TF decay) -# NOTE: momentum varies btw .99 and .9997 depending on source -# .99 in official TF TPU impl -# .9997 (w/ .999 in search space) for paper -BN_MOMENTUM_TF_DEFAULT = 1 - 0.99 -BN_EPS_TF_DEFAULT = 1e-3 -_BN_ARGS_TF = dict(momentum=BN_MOMENTUM_TF_DEFAULT, eps=BN_EPS_TF_DEFAULT) - - -def get_bn_args_tf(): - return _BN_ARGS_TF.copy() - - -def resolve_bn_args(kwargs): - bn_args = get_bn_args_tf() if kwargs.pop('bn_tf', False) else {} - bn_momentum = kwargs.pop('bn_momentum', None) - if bn_momentum is not None: - bn_args['momentum'] = bn_momentum - bn_eps = kwargs.pop('bn_eps', None) - if bn_eps is not None: - bn_args['eps'] = bn_eps - return bn_args - - -_SE_ARGS_DEFAULT = dict( - gate_fn=sigmoid, - act_layer=None, - reduce_mid=False, - divisor=1) - - -def resolve_se_args(kwargs, in_chs, act_layer=None): - se_kwargs = kwargs.copy() if kwargs is not None else {} - # fill in args that aren't specified with the defaults - for k, v in _SE_ARGS_DEFAULT.items(): - se_kwargs.setdefault(k, v) - # some models, like MobileNetV3, calculate SE reduction chs from the containing block's mid_ch instead of in_ch - if not se_kwargs.pop('reduce_mid'): - se_kwargs['reduced_base_chs'] = in_chs - # act_layer override, if it remains None, the containing block's act_layer will be used - if se_kwargs['act_layer'] is None: - assert act_layer is not None - se_kwargs['act_layer'] = act_layer - return se_kwargs - - -def resolve_act_layer(kwargs, default='relu'): - act_layer = kwargs.pop('act_layer', default) - if isinstance(act_layer, str): - act_layer = get_act_layer(act_layer) - return act_layer - - -def make_divisible(v, divisor=8, min_value=None): - min_value = min_value or divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%.
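- # Worked example (added for illustration, not in the original source): make_divisible(10, 8) computes new_v = max(8, int(10 + 4) // 8 * 8) = 8; since 8 < 0.9 * 10, the guard below bumps it to 16, while make_divisible(22, 8) yields 24 and passes through unchanged.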
- if new_v < 0.9 * v: - new_v += divisor - return new_v - - -def round_channels(channels, multiplier=1.0, divisor=8, channel_min=None): - """Round number of filters based on depth multiplier.""" - if not multiplier: - return channels - channels *= multiplier - return make_divisible(channels, divisor, channel_min) - - -class ChannelShuffle(nn.Module): - # FIXME haven't used yet - def __init__(self, groups): - super(ChannelShuffle, self).__init__() - self.groups = groups - - def forward(self, x): - """Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,W] -> [N,C,H,W]""" - N, C, H, W = x.size() - g = self.groups - assert C % g == 0, "Incompatible group size {} for input channel {}".format( - g, C - ) - return ( - x.view(N, g, int(C / g), H, W) - .permute(0, 2, 1, 3, 4) - .contiguous() - .view(N, C, H, W) - ) - - -class SqueezeExcite(nn.Module): - def __init__(self, in_chs, se_ratio=0.25, reduced_base_chs=None, - act_layer=nn.ReLU, gate_fn=sigmoid, divisor=1, **_): - super(SqueezeExcite, self).__init__() - reduced_chs = make_divisible((reduced_base_chs or in_chs) * se_ratio, divisor) - self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True) - self.act1 = act_layer(inplace=True) - self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True) - self.gate_fn = gate_fn - - def forward(self, x): - x_se = x.mean((2, 3), keepdim=True) - x_se = self.conv_reduce(x_se) - x_se = self.act1(x_se) - x_se = self.conv_expand(x_se) - return x * self.gate_fn(x_se) - - -class ConvBnAct(nn.Module): - def __init__(self, in_chs, out_chs, kernel_size, - stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, - norm_layer=nn.BatchNorm2d, norm_kwargs=None): - super(ConvBnAct, self).__init__() - norm_kwargs = norm_kwargs or {} - self.conv = create_conv2d(in_chs, out_chs, kernel_size, stride=stride, dilation=dilation, padding=pad_type) - self.bn1 = norm_layer(out_chs, **norm_kwargs) - self.act1 = act_layer(inplace=True) - - def feature_info(self, location): - if location == 'expansion': # output of conv after act, same as block output - info = dict(module='act1', hook_type='forward', num_chs=self.conv.out_channels) - else: # location == 'bottleneck', block output - info = dict(module='', hook_type='', num_chs=self.conv.out_channels) - return info - - def forward(self, x): - x = self.conv(x) - x = self.bn1(x) - x = self.act1(x) - return x - - -class DepthwiseSeparableConv(nn.Module): - """ DepthwiseSeparable block - Used for DS convs in MobileNet-V1 and in the place of IR blocks that have no expansion - (factor of 1.0). This is an alternative to having an IR with an optional first pw conv. - """ - def __init__(self, in_chs, out_chs, dw_kernel_size=3, - stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False, - pw_kernel_size=1, pw_act=False, se_ratio=0., se_kwargs=None, - norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_path_rate=0.): - super(DepthwiseSeparableConv, self).__init__() - norm_kwargs = norm_kwargs or {} - has_se = se_ratio is not None and se_ratio > 0.
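- # Clarifying note (added, not in the original source): the identity shortcut below only activates when the block preserves shape, i.e. stride == 1 and in_chs == out_chs, and is disabled entirely by noskip.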
- self.has_residual = (stride == 1 and in_chs == out_chs) and not noskip - self.has_pw_act = pw_act # activation after point-wise conv - self.drop_path_rate = drop_path_rate - - self.conv_dw = create_conv2d( - in_chs, in_chs, dw_kernel_size, stride=stride, dilation=dilation, padding=pad_type, depthwise=True) - self.bn1 = norm_layer(in_chs, **norm_kwargs) - self.act1 = act_layer(inplace=True) - - # Squeeze-and-excitation - if has_se: - se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer) - self.se = SqueezeExcite(in_chs, se_ratio=se_ratio, **se_kwargs) - else: - self.se = None - - self.conv_pw = create_conv2d(in_chs, out_chs, pw_kernel_size, padding=pad_type) - self.bn2 = norm_layer(out_chs, **norm_kwargs) - self.act2 = act_layer(inplace=True) if self.has_pw_act else nn.Identity() - - def feature_info(self, location): - if location == 'expansion': # after SE, input to PW - info = dict(module='conv_pw', hook_type='forward_pre', num_chs=self.conv_pw.in_channels) - else: # location == 'bottleneck', block output - info = dict(module='', hook_type='', num_chs=self.conv_pw.out_channels) - return info - - def forward(self, x): - residual = x - - x = self.conv_dw(x) - x = self.bn1(x) - x = self.act1(x) - - if self.se is not None: - x = self.se(x) - - x = self.conv_pw(x) - x = self.bn2(x) - x = self.act2(x) - - if self.has_residual: - if self.drop_path_rate > 0.: - x = drop_path(x, self.drop_path_rate, self.training) - x += residual - return x - - -class InvertedResidual(nn.Module): - """ Inverted residual block w/ optional SE - - Originally used in MobileNet-V2 - https://arxiv.org/abs/1801.04381v4, this layer is often - referred to as 'MBConv' for (Mobile inverted bottleneck conv) and is also used in - * MNasNet - https://arxiv.org/abs/1807.11626 - * EfficientNet - https://arxiv.org/abs/1905.11946 - * MobileNet-V3 - https://arxiv.org/abs/1905.02244 - """ - - def __init__(self, in_chs, out_chs, dw_kernel_size=3, - stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False, - exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1, - se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, - conv_kwargs=None, drop_path_rate=0.): - super(InvertedResidual, self).__init__() - norm_kwargs = norm_kwargs or {} - conv_kwargs = conv_kwargs or {} - mid_chs = make_divisible(in_chs * exp_ratio) - has_se = se_ratio is not None and se_ratio > 0. 
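- # Worked example (added for illustration, not in the original source): with in_chs=16 and exp_ratio=6.0, mid_chs = make_divisible(96) = 96, so the block runs 16 -> 96 (pw expand) -> 96 (dw conv) -> out_chs (pw linear projection).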
- self.has_residual = (in_chs == out_chs and stride == 1) and not noskip - self.drop_path_rate = drop_path_rate - - # Point-wise expansion - self.conv_pw = create_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type, **conv_kwargs) - self.bn1 = norm_layer(mid_chs, **norm_kwargs) - self.act1 = act_layer(inplace=True) - - # Depth-wise convolution - self.conv_dw = create_conv2d( - mid_chs, mid_chs, dw_kernel_size, stride=stride, dilation=dilation, - padding=pad_type, depthwise=True, **conv_kwargs) - self.bn2 = norm_layer(mid_chs, **norm_kwargs) - self.act2 = act_layer(inplace=True) - - # Squeeze-and-excitation - if has_se: - se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer) - self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs) - else: - self.se = None - - # Point-wise linear projection - self.conv_pwl = create_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type, **conv_kwargs) - self.bn3 = norm_layer(out_chs, **norm_kwargs) - - def feature_info(self, location): - if location == 'expansion': # after SE, input to PWL - info = dict(module='conv_pwl', hook_type='forward_pre', num_chs=self.conv_pwl.in_channels) - else: # location == 'bottleneck', block output - info = dict(module='', hook_type='', num_chs=self.conv_pwl.out_channels) - return info - - def forward(self, x): - residual = x - - # Point-wise expansion - x = self.conv_pw(x) - x = self.bn1(x) - x = self.act1(x) - - # Depth-wise convolution - x = self.conv_dw(x) - x = self.bn2(x) - x = self.act2(x) - - # Squeeze-and-excitation - if self.se is not None: - x = self.se(x) - - # Point-wise linear projection - x = self.conv_pwl(x) - x = self.bn3(x) - - if self.has_residual: - if self.drop_path_rate > 0.: - x = drop_path(x, self.drop_path_rate, self.training) - x += residual - - return x - - -class CondConvResidual(InvertedResidual): - """ Inverted residual block w/ CondConv routing""" - - def __init__(self, in_chs, out_chs, dw_kernel_size=3, - stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False, - exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1, - se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, - num_experts=0, drop_path_rate=0.): - - self.num_experts = num_experts - conv_kwargs = dict(num_experts=self.num_experts) - - super(CondConvResidual, self).__init__( - in_chs, out_chs, dw_kernel_size=dw_kernel_size, stride=stride, dilation=dilation, pad_type=pad_type, - act_layer=act_layer, noskip=noskip, exp_ratio=exp_ratio, exp_kernel_size=exp_kernel_size, - pw_kernel_size=pw_kernel_size, se_ratio=se_ratio, se_kwargs=se_kwargs, - norm_layer=norm_layer, norm_kwargs=norm_kwargs, conv_kwargs=conv_kwargs, - drop_path_rate=drop_path_rate) - - self.routing_fn = nn.Linear(in_chs, self.num_experts) - - def forward(self, x): - residual = x - - # CondConv routing - pooled_inputs = F.adaptive_avg_pool2d(x, 1).flatten(1) - routing_weights = torch.sigmoid(self.routing_fn(pooled_inputs)) - - # Point-wise expansion - x = self.conv_pw(x, routing_weights) - x = self.bn1(x) - x = self.act1(x) - - # Depth-wise convolution - x = self.conv_dw(x, routing_weights) - x = self.bn2(x) - x = self.act2(x) - - # Squeeze-and-excitation - if self.se is not None: - x = self.se(x) - - # Point-wise linear projection - x = self.conv_pwl(x, routing_weights) - x = self.bn3(x) - - if self.has_residual: - if self.drop_path_rate > 0.: - x = drop_path(x, self.drop_path_rate, self.training) - x += residual - return x - - -class EdgeResidual(nn.Module): - """ Residual block with expansion convolution followed by 
pointwise-linear w/ stride - - Originally introduced in `EfficientNet-EdgeTPU: Creating Accelerator-Optimized Neural Networks with AutoML` - - https://ai.googleblog.com/2019/08/efficientnet-edgetpu-creating.html - - This layer is also called FusedMBConv in the MobileDet, EfficientNet-X, and EfficientNet-V2 papers - * MobileDet - https://arxiv.org/abs/2004.14525 - * EfficientNet-X - https://arxiv.org/abs/2102.05610 - * EfficientNet-V2 - https://arxiv.org/abs/2104.00298 - """ - - def __init__(self, in_chs, out_chs, exp_kernel_size=3, exp_ratio=1.0, fake_in_chs=0, - stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False, pw_kernel_size=1, - se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, - drop_path_rate=0.): - super(EdgeResidual, self).__init__() - norm_kwargs = norm_kwargs or {} - if fake_in_chs > 0: - mid_chs = make_divisible(fake_in_chs * exp_ratio) - else: - mid_chs = make_divisible(in_chs * exp_ratio) - has_se = se_ratio is not None and se_ratio > 0. - self.has_residual = (in_chs == out_chs and stride == 1) and not noskip - self.drop_path_rate = drop_path_rate - - # Expansion convolution - self.conv_exp = create_conv2d( - in_chs, mid_chs, exp_kernel_size, stride=stride, dilation=dilation, padding=pad_type) - self.bn1 = norm_layer(mid_chs, **norm_kwargs) - self.act1 = act_layer(inplace=True) - - # Squeeze-and-excitation - if has_se: - se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer) - self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs) - else: - self.se = None - - # Point-wise linear projection - self.conv_pwl = create_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type) - self.bn2 = norm_layer(out_chs, **norm_kwargs) - - def feature_info(self, location): - if location == 'expansion': # after SE, before PWL - info = dict(module='conv_pwl', hook_type='forward_pre', num_chs=self.conv_pwl.in_channels) - else: # location == 'bottleneck', block output - info = dict(module='', hook_type='', num_chs=self.conv_pwl.out_channels) - return info - - def forward(self, x): - residual = x - - # Expansion convolution - x = self.conv_exp(x) - x = self.bn1(x) - x = self.act1(x) - - # Squeeze-and-excitation - if self.se is not None: - x = self.se(x) - - # Point-wise linear projection - x = self.conv_pwl(x) - x = self.bn2(x) - - if self.has_residual: - if self.drop_path_rate > 0.: - x = drop_path(x, self.drop_path_rate, self.training) - x += residual - - return x diff --git a/AVLFormer/src/timm/models/efficientnet_builder.py b/AVLFormer/src/timm/models/efficientnet_builder.py deleted file mode 100644 index f670aa6..0000000 --- a/AVLFormer/src/timm/models/efficientnet_builder.py +++ /dev/null @@ -1,414 +0,0 @@ -""" EfficientNet, MobileNetV3, etc Builder - -Assembles EfficientNet and related network feature blocks from string definitions. -Handles stride, dilation calculations, and selects feature extraction points.
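- -As an illustrative example (this string is an assumption for illustration, not taken from a specific model definition): the block string 'ir_r2_k3_s2_e6_c32_se0.25' decodes to two repeats of an InvertedResidual block with a 3x3 depthwise kernel, stride 2 (the builder keeps the stride only on the first block in a stack), expansion ratio 6.0, 32 output channels, and an SE ratio of 0.25; see _decode_block_str below for the full notation.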
- -Hacked together by / Copyright 2020 Ross Wightman -""" - -import logging -import math -import re -from copy import deepcopy - -import torch.nn as nn - -from .efficientnet_blocks import * -from .layers import CondConv2d, get_condconv_initializer - -__all__ = ["EfficientNetBuilder", "decode_arch_def", "efficientnet_init_weights"] - -_logger = logging.getLogger(__name__) - - -def _log_info_if(msg, condition): - if condition: - _logger.info(msg) - - -def _parse_ksize(ss): - if ss.isdigit(): - return int(ss) - else: - return [int(k) for k in ss.split('.')] - - -def _decode_block_str(block_str): - """ Decode block definition string - - Gets a list of block arg (dicts) through a string notation of arguments. - E.g. ir_r2_k3_s2_e1_i32_o16_se0.25_noskip - - All args can exist in any order with the exception of the leading string which - is assumed to indicate the block type. - - leading string - block type ( - ir = InvertedResidual, ds = DepthwiseSep, dsa = DepthwiseSep with pw act, cn = ConvBnAct) - r - number of repeat blocks, - k - kernel size, - s - strides (1-9), - e - expansion ratio, - c - output channels, - se - squeeze/excitation ratio - n - activation fn ('re', 'r6', 'hs', or 'sw') - Args: - block_str: a string representation of block arguments. - Returns: - A list of block args (dicts) - Raises: - ValueError: if the string def is not properly specified (TODO) - """ - assert isinstance(block_str, str) - ops = block_str.split('_') - block_type = ops[0] # take the block type off the front - ops = ops[1:] - options = {} - noskip = False - for op in ops: - # string options being checked on an individual basis, combine if they grow - if op == 'noskip': - noskip = True - elif op.startswith('n'): - # activation fn - key = op[0] - v = op[1:] - if v == 're': - value = get_act_layer('relu') - elif v == 'r6': - value = get_act_layer('relu6') - elif v == 'hs': - value = get_act_layer('hard_swish') - elif v == 'sw': - value = get_act_layer('swish') - else: - continue - options[key] = value - else: - # all numeric options - splits = re.split(r'(\d.*)', op) - if len(splits) >= 2: - key, value = splits[:2] - options[key] = value - - # if act_layer is None, the model default (passed to model init) will be used - act_layer = options['n'] if 'n' in options else None - exp_kernel_size = _parse_ksize(options['a']) if 'a' in options else 1 - pw_kernel_size = _parse_ksize(options['p']) if 'p' in options else 1 - fake_in_chs = int(options['fc']) if 'fc' in options else 0 # FIXME hack to deal with in_chs issue in TPU def - - num_repeat = int(options['r']) - # each type of block has different valid arguments, fill accordingly - if block_type == 'ir': - block_args = dict( - block_type=block_type, - dw_kernel_size=_parse_ksize(options['k']), - exp_kernel_size=exp_kernel_size, - pw_kernel_size=pw_kernel_size, - out_chs=int(options['c']), - exp_ratio=float(options['e']), - se_ratio=float(options['se']) if 'se' in options else None, - stride=int(options['s']), - act_layer=act_layer, - noskip=noskip, - ) - if 'cc' in options: - block_args['num_experts'] = int(options['cc']) - elif block_type == 'ds' or block_type == 'dsa': - block_args = dict( - block_type=block_type, - dw_kernel_size=_parse_ksize(options['k']), - pw_kernel_size=pw_kernel_size, - out_chs=int(options['c']), - se_ratio=float(options['se']) if 'se' in options else None, - stride=int(options['s']), - act_layer=act_layer, - pw_act=block_type == 'dsa', - noskip=block_type == 'dsa' or noskip, - ) - elif block_type == 'er': - block_args = dict( -
block_type=block_type, - exp_kernel_size=_parse_ksize(options['k']), - pw_kernel_size=pw_kernel_size, - out_chs=int(options['c']), - exp_ratio=float(options['e']), - fake_in_chs=fake_in_chs, - se_ratio=float(options['se']) if 'se' in options else None, - stride=int(options['s']), - act_layer=act_layer, - noskip=noskip, - ) - elif block_type == 'cn': - block_args = dict( - block_type=block_type, - kernel_size=int(options['k']), - out_chs=int(options['c']), - stride=int(options['s']), - act_layer=act_layer, - ) - else: - assert False, 'Unknown block type (%s)' % block_type - - return block_args, num_repeat - - -def _scale_stage_depth(stack_args, repeats, depth_multiplier=1.0, depth_trunc='ceil'): - """ Per-stage depth scaling - Scales the block repeats in each stage. This depth scaling impl maintains - compatibility with the EfficientNet scaling method, while allowing sensible - scaling for other models that may have multiple block arg definitions in each stage. - """ - - # We scale the total repeat count for each stage, there may be multiple - # block arg defs per stage so we need to sum. - num_repeat = sum(repeats) - if depth_trunc == 'round': - # Truncating to int by rounding allows stages with few repeats to remain - # proportionally smaller for longer. This is a good choice when stage definitions - # include single repeat stages that we'd prefer to keep that way as long as possible - num_repeat_scaled = max(1, round(num_repeat * depth_multiplier)) - else: - # The default for EfficientNet truncates repeats to int via 'ceil'. - # Any multiplier > 1.0 will result in an increased depth for every stage. - num_repeat_scaled = int(math.ceil(num_repeat * depth_multiplier)) - - # Proportionally distribute repeat count scaling to each block definition in the stage. - # Allocation is done in reverse as it results in the first block being less likely to be scaled. - # The first block makes less sense to repeat in most of the arch definitions. 
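- # Worked example (added for illustration, not in the original source): repeats=[2, 3] with depth_multiplier=1.4 gives num_repeat=5 and num_repeat_scaled=ceil(5 * 1.4)=7; walking [3, 2] in reverse allocates round(3/5 * 7)=4 then round(2/2 * 3)=3, so the stage scales to [3, 4].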
- repeats_scaled = [] - for r in repeats[::-1]: - rs = max(1, round((r / num_repeat * num_repeat_scaled))) - repeats_scaled.append(rs) - num_repeat -= r - num_repeat_scaled -= rs - repeats_scaled = repeats_scaled[::-1] - - # Apply the calculated scaling to each block arg in the stage - sa_scaled = [] - for ba, rep in zip(stack_args, repeats_scaled): - sa_scaled.extend([deepcopy(ba) for _ in range(rep)]) - return sa_scaled - - -def decode_arch_def(arch_def, depth_multiplier=1.0, depth_trunc='ceil', experts_multiplier=1, fix_first_last=False): - arch_args = [] - for stack_idx, block_strings in enumerate(arch_def): - assert isinstance(block_strings, list) - stack_args = [] - repeats = [] - for block_str in block_strings: - assert isinstance(block_str, str) - ba, rep = _decode_block_str(block_str) - if ba.get('num_experts', 0) > 0 and experts_multiplier > 1: - ba['num_experts'] *= experts_multiplier - stack_args.append(ba) - repeats.append(rep) - if fix_first_last and (stack_idx == 0 or stack_idx == len(arch_def) - 1): - arch_args.append(_scale_stage_depth(stack_args, repeats, 1.0, depth_trunc)) - else: - arch_args.append(_scale_stage_depth(stack_args, repeats, depth_multiplier, depth_trunc)) - return arch_args - - -class EfficientNetBuilder: - """ Build Trunk Blocks - - This ended up being somewhat of a cross between - https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_models.py - and - https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/backbone/fbnet_builder.py - - """ - def __init__(self, channel_multiplier=1.0, channel_divisor=8, channel_min=None, - output_stride=32, pad_type='', act_layer=None, se_kwargs=None, - norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_path_rate=0., feature_location='', - verbose=False): - self.channel_multiplier = channel_multiplier - self.channel_divisor = channel_divisor - self.channel_min = channel_min - self.output_stride = output_stride - self.pad_type = pad_type - self.act_layer = act_layer - self.se_kwargs = se_kwargs - self.norm_layer = norm_layer - self.norm_kwargs = norm_kwargs - self.drop_path_rate = drop_path_rate - if feature_location == 'depthwise': - # old 'depthwise' mode renamed 'expansion' to match TF impl, old expansion mode didn't make sense - _logger.warning("feature_location=='depthwise' is deprecated, using 'expansion'") - feature_location = 'expansion' - self.feature_location = feature_location - assert feature_location in ('bottleneck', 'expansion', '') - self.verbose = verbose - - # state updated during build, consumed by model - self.in_chs = None - self.features = [] - - def _round_channels(self, chs): - return round_channels(chs, self.channel_multiplier, self.channel_divisor, self.channel_min) - - def _make_block(self, ba, block_idx, block_count): - drop_path_rate = self.drop_path_rate * block_idx / block_count - bt = ba.pop('block_type') - ba['in_chs'] = self.in_chs - ba['out_chs'] = self._round_channels(ba['out_chs']) - if 'fake_in_chs' in ba and ba['fake_in_chs']: - # FIXME this is a hack to work around mismatch in origin impl input filters - ba['fake_in_chs'] = self._round_channels(ba['fake_in_chs']) - ba['norm_layer'] = self.norm_layer - ba['norm_kwargs'] = self.norm_kwargs - ba['pad_type'] = self.pad_type - # block act fn overrides the model default - ba['act_layer'] = ba['act_layer'] if ba['act_layer'] is not None else self.act_layer - assert ba['act_layer'] is not None - if bt == 'ir': - ba['drop_path_rate'] = drop_path_rate - ba['se_kwargs'] = 
self.se_kwargs - _log_info_if(' InvertedResidual {}, Args: {}'.format(block_idx, str(ba)), self.verbose) - if ba.get('num_experts', 0) > 0: - block = CondConvResidual(**ba) - else: - block = InvertedResidual(**ba) - elif bt == 'ds' or bt == 'dsa': - ba['drop_path_rate'] = drop_path_rate - ba['se_kwargs'] = self.se_kwargs - _log_info_if(' DepthwiseSeparable {}, Args: {}'.format(block_idx, str(ba)), self.verbose) - block = DepthwiseSeparableConv(**ba) - elif bt == 'er': - ba['drop_path_rate'] = drop_path_rate - ba['se_kwargs'] = self.se_kwargs - _log_info_if(' EdgeResidual {}, Args: {}'.format(block_idx, str(ba)), self.verbose) - block = EdgeResidual(**ba) - elif bt == 'cn': - _log_info_if(' ConvBnAct {}, Args: {}'.format(block_idx, str(ba)), self.verbose) - block = ConvBnAct(**ba) - else: - assert False, 'Unknown block type (%s) while building model.' % bt - self.in_chs = ba['out_chs'] # update in_chs for arg of next block - - return block - - def __call__(self, in_chs, model_block_args): - """ Build the blocks - Args: - in_chs: Number of input-channels passed to first block - model_block_args: A list of lists, outer list defines stages, inner - list contains strings defining block configuration(s) - Return: - List of block stacks (each stack wrapped in nn.Sequential) - """ - _log_info_if('Building model trunk with %d stages...' % len(model_block_args), self.verbose) - self.in_chs = in_chs - total_block_count = sum([len(x) for x in model_block_args]) - total_block_idx = 0 - current_stride = 2 - current_dilation = 1 - stages = [] - if model_block_args[0][0]['stride'] > 1: - # if the first block starts with a stride, we need to extract first level feat from stem - feature_info = dict( - module='act1', num_chs=in_chs, stage=0, reduction=current_stride, - hook_type='forward' if self.feature_location != 'bottleneck' else '') - self.features.append(feature_info) - - # outer list of block_args defines the stacks - for stack_idx, stack_args in enumerate(model_block_args): - last_stack = stack_idx + 1 == len(model_block_args) - _log_info_if('Stack: {}'.format(stack_idx), self.verbose) - assert isinstance(stack_args, list) - - blocks = [] - # each stack (stage of blocks) contains a list of block arguments - for block_idx, block_args in enumerate(stack_args): - last_block = block_idx + 1 == len(stack_args) - _log_info_if(' Block: {}'.format(block_idx), self.verbose) - - assert block_args['stride'] in (1, 2) - if block_idx >= 1: # only the first block in any stack can have a stride > 1 - block_args['stride'] = 1 - - extract_features = False - if last_block: - next_stack_idx = stack_idx + 1 - extract_features = next_stack_idx >= len(model_block_args) or \ - model_block_args[next_stack_idx][0]['stride'] > 1 - - next_dilation = current_dilation - if block_args['stride'] > 1: - next_output_stride = current_stride * block_args['stride'] - if next_output_stride > self.output_stride: - next_dilation = current_dilation * block_args['stride'] - block_args['stride'] = 1 - _log_info_if(' Converting stride to dilation to maintain output_stride=={}'.format( - self.output_stride), self.verbose) - else: - current_stride = next_output_stride - block_args['dilation'] = current_dilation - if next_dilation != current_dilation: - current_dilation = next_dilation - - # create the block - block = self._make_block(block_args, total_block_idx, total_block_count) - blocks.append(block) - - # stash feature module name and channel info for model feature extraction - if extract_features: - feature_info = dict( - stage=stack_idx +
1, reduction=current_stride, **block.feature_info(self.feature_location)) - module_name = f'blocks.{stack_idx}.{block_idx}' - leaf_name = feature_info.get('module', '') - feature_info['module'] = '.'.join([module_name, leaf_name]) if leaf_name else module_name - self.features.append(feature_info) - - total_block_idx += 1 # incr global block idx (across all stacks) - stages.append(nn.Sequential(*blocks)) - return stages - - -def _init_weight_goog(m, n='', fix_group_fanout=True): - """ Weight initialization as per Tensorflow official implementations. - - Args: - m (nn.Module): module to init - n (str): module name - fix_group_fanout (bool): enable correct (matching Tensorflow TPU impl) fanout calculation w/ group convs - - Handles layers in EfficientNet, EfficientNet-CondConv, MixNet, MnasNet, MobileNetV3, etc: - * https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_model.py - * https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.py - """ - if isinstance(m, CondConv2d): - fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - if fix_group_fanout: - fan_out //= m.groups - init_weight_fn = get_condconv_initializer( - lambda w: w.data.normal_(0, math.sqrt(2.0 / fan_out)), m.num_experts, m.weight_shape) - init_weight_fn(m.weight) - if m.bias is not None: - m.bias.data.zero_() - elif isinstance(m, nn.Conv2d): - fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - if fix_group_fanout: - fan_out //= m.groups - m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) - if m.bias is not None: - m.bias.data.zero_() - elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1.0) - m.bias.data.zero_() - elif isinstance(m, nn.Linear): - fan_out = m.weight.size(0) # fan-out - fan_in = 0 - if 'routing_fn' in n: - fan_in = m.weight.size(1) - init_range = 1.0 / math.sqrt(fan_in + fan_out) - m.weight.data.uniform_(-init_range, init_range) - m.bias.data.zero_() - - -def efficientnet_init_weights(model: nn.Module, init_fn=None): - init_fn = init_fn or _init_weight_goog - for n, m in model.named_modules(): - init_fn(m, n) - diff --git a/AVLFormer/src/timm/models/factory.py b/AVLFormer/src/timm/models/factory.py deleted file mode 100644 index d040a9f..0000000 --- a/AVLFormer/src/timm/models/factory.py +++ /dev/null @@ -1,86 +0,0 @@ -from .registry import is_model, is_model_in_modules, model_entrypoint -from .helpers import load_checkpoint -from .layers import set_layer_config -from .hub import load_model_config_from_hf - - -def split_model_name(model_name): - model_split = model_name.split(':', 1) - if len(model_split) == 1: - return '', model_split[0] - else: - source_name, model_name = model_split - assert source_name in ('timm', 'hf_hub') - return source_name, model_name - - -def safe_model_name(model_name, remove_source=True): - def make_safe(name): - return ''.join(c if c.isalnum() else '_' for c in name).rstrip('_') - if remove_source: - model_name = split_model_name(model_name)[-1] - return make_safe(model_name) - - -def create_model( - model_name, - pretrained=False, - checkpoint_path='', - scriptable=None, - exportable=None, - no_jit=None, - **kwargs): - """Create a model - - Args: - model_name (str): name of model to instantiate - pretrained (bool): load pretrained ImageNet-1k weights if true - checkpoint_path (str): path of checkpoint to load after model is initialized - scriptable (bool): set layer config so that model is jit scriptable (not working for all models yet) - exportable (bool): set layer config so that 
model is traceable / ONNX exportable (not fully impl/obeyed yet) - no_jit (bool): set layer config so that model doesn't utilize jit scripted layers (so far activations only) - - Keyword Args: - drop_rate (float): dropout rate for training (default: 0.0) - global_pool (str): global pool type (default: 'avg') - **: other kwargs are model specific - """ - source_name, model_name = split_model_name(model_name) - - # Only EfficientNet and MobileNetV3 models have support for batchnorm params or drop_connect_rate passed as args - is_efficientnet = is_model_in_modules(model_name, ['efficientnet', 'mobilenetv3']) - if not is_efficientnet: - kwargs.pop('bn_tf', None) - kwargs.pop('bn_momentum', None) - kwargs.pop('bn_eps', None) - - # handle backwards compat with drop_connect -> drop_path change - drop_connect_rate = kwargs.pop('drop_connect_rate', None) - if drop_connect_rate is not None and kwargs.get('drop_path_rate', None) is None: - print("WARNING: 'drop_connect' as an argument is deprecated, please use 'drop_path'." - " Setting drop_path to %f." % drop_connect_rate) - kwargs['drop_path_rate'] = drop_connect_rate - - # Parameters that aren't supported by all models or are intended to only override model defaults if set - # should default to None in command line args/cfg. Remove them if they are present and not set so that - # non-supporting models don't break and default args remain in effect. - kwargs = {k: v for k, v in kwargs.items() if v is not None} - - if source_name == 'hf_hub': - # For model names specified in the form `hf_hub:path/architecture_name#revision`, - # load model weights + default_cfg from Hugging Face hub. - hf_default_cfg, model_name = load_model_config_from_hf(model_name) - kwargs['external_default_cfg'] = hf_default_cfg # FIXME revamp default_cfg interface someday - - if is_model(model_name): - create_fn = model_entrypoint(model_name) - else: - raise RuntimeError('Unknown model (%s)' % model_name) - - with set_layer_config(scriptable=scriptable, exportable=exportable, no_jit=no_jit): - model = create_fn(pretrained=pretrained, **kwargs) - - if checkpoint_path: - load_checkpoint(model, checkpoint_path) - - return model diff --git a/AVLFormer/src/timm/models/features.py b/AVLFormer/src/timm/models/features.py deleted file mode 100644 index b1d6890..0000000 --- a/AVLFormer/src/timm/models/features.py +++ /dev/null @@ -1,284 +0,0 @@ -""" PyTorch Feature Extraction Helpers - -A collection of classes, functions, modules to help extract features from models -and provide a common interface for describing them. 
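- -Typical use, as an illustrative sketch (assumes a model exposing a valid feature_info): FeatureListNet(model, out_indices=(1, 2, 4)) returns a list of intermediate feature maps from forward(), while FeatureHookNet covers models whose features are only reachable via forward/forward-pre hooks.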
- -The return_layers, module re-writing idea inspired by torchvision IntermediateLayerGetter -https://github.com/pytorch/vision/blob/d88d8961ae51507d0cb680329d985b1488b1b76b/torchvision/models/_utils.py - -Hacked together by / Copyright 2020 Ross Wightman -""" -from collections import OrderedDict, defaultdict -from copy import deepcopy -from functools import partial -from typing import Dict, List, Tuple - -import torch -import torch.nn as nn - - -class FeatureInfo: - - def __init__(self, feature_info: List[Dict], out_indices: Tuple[int]): - prev_reduction = 1 - for fi in feature_info: - # sanity check the mandatory fields, there may be additional fields depending on the model - assert 'num_chs' in fi and fi['num_chs'] > 0 - assert 'reduction' in fi and fi['reduction'] >= prev_reduction - prev_reduction = fi['reduction'] - assert 'module' in fi - self.out_indices = out_indices - self.info = feature_info - - def from_other(self, out_indices: Tuple[int]): - return FeatureInfo(deepcopy(self.info), out_indices) - - def get(self, key, idx=None): - """ Get value by key at specified index (indices) - if idx is None, returns value for key at each output index - if idx is an integer, return value for that feature module index (ignoring output indices) - if idx is a list/tuple, return value for each module index (ignoring output indices) - """ - if idx is None: - return [self.info[i][key] for i in self.out_indices] - if isinstance(idx, (tuple, list)): - return [self.info[i][key] for i in idx] - else: - return self.info[idx][key] - - def get_dicts(self, keys=None, idx=None): - """ return info dicts for specified keys (or all if None) at specified indices (or out_indices if None) - """ - if idx is None: - if keys is None: - return [self.info[i] for i in self.out_indices] - else: - return [{k: self.info[i][k] for k in keys} for i in self.out_indices] - if isinstance(idx, (tuple, list)): - return [self.info[i] if keys is None else {k: self.info[i][k] for k in keys} for i in idx] - else: - return self.info[idx] if keys is None else {k: self.info[idx][k] for k in keys} - - def channels(self, idx=None): - """ feature channels accessor - """ - return self.get('num_chs', idx) - - def reduction(self, idx=None): - """ feature reduction (output stride) accessor - """ - return self.get('reduction', idx) - - def module_name(self, idx=None): - """ feature module name accessor - """ - return self.get('module', idx) - - def __getitem__(self, item): - return self.info[item] - - def __len__(self): - return len(self.info) - - -class FeatureHooks: - """ Feature Hook Helper - - This module helps with the setup and extraction of hooks for extracting features from - internal nodes in a model by node name. This works quite well in eager Python but needs - redesign for torchscript.
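- Example (illustrative only; the module name is hypothetical): hooks=[{'module': 'blocks.2', 'hook_type': 'forward'}] registers a forward hook on the submodule named 'blocks.2', stashes its output under that id, and get_output(device) reads the results back per device.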
- """ - - def __init__(self, hooks, named_modules, out_map=None, default_hook_type='forward'): - # setup feature hooks - modules = {k: v for k, v in named_modules} - for i, h in enumerate(hooks): - hook_name = h['module'] - m = modules[hook_name] - hook_id = out_map[i] if out_map else hook_name - hook_fn = partial(self._collect_output_hook, hook_id) - hook_type = h['hook_type'] if 'hook_type' in h else default_hook_type - if hook_type == 'forward_pre': - m.register_forward_pre_hook(hook_fn) - elif hook_type == 'forward': - m.register_forward_hook(hook_fn) - else: - assert False, "Unsupported hook type" - self._feature_outputs = defaultdict(OrderedDict) - - def _collect_output_hook(self, hook_id, *args): - x = args[-1] # tensor we want is last argument, output for fwd, input for fwd_pre - if isinstance(x, tuple): - x = x[0] # unwrap input tuple - self._feature_outputs[x.device][hook_id] = x - - def get_output(self, device) -> Dict[str, torch.tensor]: - output = self._feature_outputs[device] - self._feature_outputs[device] = OrderedDict() # clear after reading - return output - - -def _module_list(module, flatten_sequential=False): - # a yield/iter would be better for this but wouldn't be compatible with torchscript - ml = [] - for name, module in module.named_children(): - if flatten_sequential and isinstance(module, nn.Sequential): - # first level of Sequential containers is flattened into containing model - for child_name, child_module in module.named_children(): - combined = [name, child_name] - ml.append(('_'.join(combined), '.'.join(combined), child_module)) - else: - ml.append((name, name, module)) - return ml - - -def _get_feature_info(net, out_indices): - feature_info = getattr(net, 'feature_info') - if isinstance(feature_info, FeatureInfo): - return feature_info.from_other(out_indices) - elif isinstance(feature_info, (list, tuple)): - return FeatureInfo(net.feature_info, out_indices) - else: - assert False, "Provided feature_info is not valid" - - -def _get_return_layers(feature_info, out_map): - module_names = feature_info.module_name() - return_layers = {} - for i, name in enumerate(module_names): - return_layers[name] = out_map[i] if out_map is not None else feature_info.out_indices[i] - return return_layers - - -class FeatureDictNet(nn.ModuleDict): - """ Feature extractor with OrderedDict return - - Wrap a model and extract features as specified by the out indices, the network is - partially re-built from contained modules. - - There is a strong assumption that the modules have been registered into the model in the same - order as they are used. There should be no reuse of the same nn.Module more than once, including - trivial modules like `self.relu = nn.ReLU`. - - Only submodules that are directly assigned to the model class (`model.feature1`) or at most - one Sequential container deep (`model.features.1`, with flatten_sequent=True) can be captured. 
- All Sequential containers that are directly assigned to the original model will have their - modules assigned to this module with the name `model.features.1` being changed to `model.features_1` - - Arguments: - model (nn.Module): model from which we will extract the features - out_indices (tuple[int]): model output indices to extract features for - out_map (sequence): list or tuple specifying desired return id for each out index, - otherwise str(index) is used - feature_concat (bool): whether to concatenate intermediate features that are lists or tuples - vs select element [0] - flatten_sequential (bool): whether to flatten sequential modules assigned to model - """ - def __init__( - self, model, - out_indices=(0, 1, 2, 3, 4), out_map=None, feature_concat=False, flatten_sequential=False): - super(FeatureDictNet, self).__init__() - self.feature_info = _get_feature_info(model, out_indices) - self.concat = feature_concat - self.return_layers = {} - return_layers = _get_return_layers(self.feature_info, out_map) - modules = _module_list(model, flatten_sequential=flatten_sequential) - remaining = set(return_layers.keys()) - layers = OrderedDict() - for new_name, old_name, module in modules: - layers[new_name] = module - if old_name in remaining: - # return id has to be consistently str type for torchscript - self.return_layers[new_name] = str(return_layers[old_name]) - remaining.remove(old_name) - if not remaining: - break - assert not remaining and len(self.return_layers) == len(return_layers), \ - f'Return layers ({remaining}) are not present in model' - self.update(layers) - - def _collect(self, x) -> (Dict[str, torch.Tensor]): - out = OrderedDict() - for name, module in self.items(): - x = module(x) - if name in self.return_layers: - out_id = self.return_layers[name] - if isinstance(x, (tuple, list)): - # If model tap is a tuple or list, concat or select first element - # FIXME this may need to be more generic / flexible for some nets - out[out_id] = torch.cat(x, 1) if self.concat else x[0] - else: - out[out_id] = x - return out - - def forward(self, x) -> Dict[str, torch.Tensor]: - return self._collect(x) - - -class FeatureListNet(FeatureDictNet): - """ Feature extractor with list return - - See docstring for FeatureDictNet above, this class exists only to appease Torchscript typing constraints. - In eager Python we could have returned List[Tensor] vs Dict[id, Tensor] based on a member bool. - """ - def __init__( - self, model, - out_indices=(0, 1, 2, 3, 4), out_map=None, feature_concat=False, flatten_sequential=False): - super(FeatureListNet, self).__init__( - model, out_indices=out_indices, out_map=out_map, feature_concat=feature_concat, - flatten_sequential=flatten_sequential) - - def forward(self, x) -> (List[torch.Tensor]): - return list(self._collect(x).values()) - - -class FeatureHookNet(nn.ModuleDict): - """ FeatureHookNet - - Wrap a model and extract features specified by the out indices using forward/forward-pre hooks. - - If `no_rewrite` is True, features are extracted via hooks without modifying the underlying - network in any way. - - If `no_rewrite` is False, the model will be re-written as in the - FeatureList/FeatureDict case by folding first to second (Sequential only) level modules into this one. 
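    Sketch (illustrative; assumes `net` exposes `feature_info` and, for the
    no_rewrite path, a `reset_classifier` method):

        fhn = FeatureHookNet(net, out_indices=(1, 2, 3), out_as_dict=True, no_rewrite=True)
        feats = fhn(torch.randn(1, 3, 224, 224))  # dict keyed by hooked module name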
- - FIXME this does not currently work with Torchscript, see FeatureHooks class - """ - def __init__( - self, model, - out_indices=(0, 1, 2, 3, 4), out_map=None, out_as_dict=False, no_rewrite=False, - feature_concat=False, flatten_sequential=False, default_hook_type='forward'): - super(FeatureHookNet, self).__init__() - assert not torch.jit.is_scripting() - self.feature_info = _get_feature_info(model, out_indices) - self.out_as_dict = out_as_dict - layers = OrderedDict() - hooks = [] - if no_rewrite: - assert not flatten_sequential - if hasattr(model, 'reset_classifier'): # make sure classifier is removed? - model.reset_classifier(0) - layers['body'] = model - hooks.extend(self.feature_info.get_dicts()) - else: - modules = _module_list(model, flatten_sequential=flatten_sequential) - remaining = {f['module']: f['hook_type'] if 'hook_type' in f else default_hook_type - for f in self.feature_info.get_dicts()} - for new_name, old_name, module in modules: - layers[new_name] = module - for fn, fm in module.named_modules(prefix=old_name): - if fn in remaining: - hooks.append(dict(module=fn, hook_type=remaining[fn])) - del remaining[fn] - if not remaining: - break - assert not remaining, f'Return layers ({remaining}) are not present in model' - self.update(layers) - self.hooks = FeatureHooks(hooks, model.named_modules(), out_map=out_map) - - def forward(self, x): - for name, module in self.items(): - x = module(x) - out = self.hooks.get_output(x.device) - return out if self.out_as_dict else list(out.values()) diff --git a/AVLFormer/src/timm/models/ghostnet.py b/AVLFormer/src/timm/models/ghostnet.py deleted file mode 100644 index 749bdfa..0000000 --- a/AVLFormer/src/timm/models/ghostnet.py +++ /dev/null @@ -1,276 +0,0 @@ -""" -An implementation of GhostNet Model as defined in: -GhostNet: More Features from Cheap Operations. 
https://arxiv.org/abs/1911.11907 -The train script of the model is similar to that of MobileNetV3 -Original model: https://github.com/huawei-noah/CV-backbones/tree/master/ghostnet_pytorch -""" -import math -from functools import partial - -import torch -import torch.nn as nn -import torch.nn.functional as F - - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .layers import SelectAdaptivePool2d, Linear, hard_sigmoid -from .efficientnet_blocks import SqueezeExcite, ConvBnAct, make_divisible -from .helpers import build_model_with_cfg -from .registry import register_model - - -__all__ = ['GhostNet'] - - -def _cfg(url='', **kwargs): - return { - 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (1, 1), - 'crop_pct': 0.875, 'interpolation': 'bilinear', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'conv_stem', 'classifier': 'classifier', - **kwargs - } - - -default_cfgs = { - 'ghostnet_050': _cfg(url=''), - 'ghostnet_100': _cfg( - url='https://github.com/huawei-noah/CV-backbones/releases/download/ghostnet_pth/ghostnet_1x.pth'), - 'ghostnet_130': _cfg(url=''), -} - - -_SE_LAYER = partial(SqueezeExcite, gate_fn=hard_sigmoid, divisor=4) - - -class GhostModule(nn.Module): - def __init__(self, inp, oup, kernel_size=1, ratio=2, dw_size=3, stride=1, relu=True): - super(GhostModule, self).__init__() - self.oup = oup - init_channels = math.ceil(oup / ratio) - new_channels = init_channels * (ratio - 1) - - self.primary_conv = nn.Sequential( - nn.Conv2d(inp, init_channels, kernel_size, stride, kernel_size//2, bias=False), - nn.BatchNorm2d(init_channels), - nn.ReLU(inplace=True) if relu else nn.Sequential(), - ) - - self.cheap_operation = nn.Sequential( - nn.Conv2d(init_channels, new_channels, dw_size, 1, dw_size//2, groups=init_channels, bias=False), - nn.BatchNorm2d(new_channels), - nn.ReLU(inplace=True) if relu else nn.Sequential(), - ) - - def forward(self, x): - x1 = self.primary_conv(x) - x2 = self.cheap_operation(x1) - out = torch.cat([x1, x2], dim=1) - return out[:, :self.oup, :, :] - - -class GhostBottleneck(nn.Module): - """ Ghost bottleneck w/ optional SE""" - - def __init__(self, in_chs, mid_chs, out_chs, dw_kernel_size=3, - stride=1, act_layer=nn.ReLU, se_ratio=0.): - super(GhostBottleneck, self).__init__() - has_se = se_ratio is not None and se_ratio > 0. 
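# Worked example of the GhostModule split above (illustrative, ratio=2): oup=64
# gives init_channels=ceil(64/2)=32 from the 1x1 primary conv and
# new_channels=32*(2-1)=32 from the cheap depthwise conv; the concat yields 64
# channels and the final [:, :self.oup] slice trims any ceil() overshoot.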
- self.stride = stride - - # Point-wise expansion - self.ghost1 = GhostModule(in_chs, mid_chs, relu=True) - - # Depth-wise convolution - if self.stride > 1: - self.conv_dw = nn.Conv2d( - mid_chs, mid_chs, dw_kernel_size, stride=stride, - padding=(dw_kernel_size-1)//2, groups=mid_chs, bias=False) - self.bn_dw = nn.BatchNorm2d(mid_chs) - else: - self.conv_dw = None - self.bn_dw = None - - # Squeeze-and-excitation - self.se = _SE_LAYER(mid_chs, se_ratio=se_ratio) if has_se else None - - # Point-wise linear projection - self.ghost2 = GhostModule(mid_chs, out_chs, relu=False) - - # shortcut - if in_chs == out_chs and self.stride == 1: - self.shortcut = nn.Sequential() - else: - self.shortcut = nn.Sequential( - nn.Conv2d( - in_chs, in_chs, dw_kernel_size, stride=stride, - padding=(dw_kernel_size-1)//2, groups=in_chs, bias=False), - nn.BatchNorm2d(in_chs), - nn.Conv2d(in_chs, out_chs, 1, stride=1, padding=0, bias=False), - nn.BatchNorm2d(out_chs), - ) - - - def forward(self, x): - residual = x - - # 1st ghost bottleneck - x = self.ghost1(x) - - # Depth-wise convolution - if self.conv_dw is not None: - x = self.conv_dw(x) - x = self.bn_dw(x) - - # Squeeze-and-excitation - if self.se is not None: - x = self.se(x) - - # 2nd ghost bottleneck - x = self.ghost2(x) - - x += self.shortcut(residual) - return x - - -class GhostNet(nn.Module): - def __init__(self, cfgs, num_classes=1000, width=1.0, dropout=0.2, in_chans=3, output_stride=32): - super(GhostNet, self).__init__() - # setting of inverted residual blocks - assert output_stride == 32, 'only output_stride==32 is valid, dilation not supported' - self.cfgs = cfgs - self.num_classes = num_classes - self.dropout = dropout - self.feature_info = [] - - # building first layer - stem_chs = make_divisible(16 * width, 4) - self.conv_stem = nn.Conv2d(in_chans, stem_chs, 3, 2, 1, bias=False) - self.feature_info.append(dict(num_chs=stem_chs, reduction=2, module=f'conv_stem')) - self.bn1 = nn.BatchNorm2d(stem_chs) - self.act1 = nn.ReLU(inplace=True) - prev_chs = stem_chs - - # building inverted residual blocks - stages = nn.ModuleList([]) - block = GhostBottleneck - stage_idx = 0 - net_stride = 2 - for cfg in self.cfgs: - layers = [] - s = 1 - for k, exp_size, c, se_ratio, s in cfg: - out_chs = make_divisible(c * width, 4) - mid_chs = make_divisible(exp_size * width, 4) - layers.append(block(prev_chs, mid_chs, out_chs, k, s, se_ratio=se_ratio)) - prev_chs = out_chs - if s > 1: - net_stride *= 2 - self.feature_info.append(dict( - num_chs=prev_chs, reduction=net_stride, module=f'blocks.{stage_idx}')) - stages.append(nn.Sequential(*layers)) - stage_idx += 1 - - out_chs = make_divisible(exp_size * width, 4) - stages.append(nn.Sequential(ConvBnAct(prev_chs, out_chs, 1))) - self.pool_dim = prev_chs = out_chs - - self.blocks = nn.Sequential(*stages) - - # building last several layers - self.num_features = out_chs = 1280 - self.global_pool = SelectAdaptivePool2d(pool_type='avg') - self.conv_head = nn.Conv2d(prev_chs, out_chs, 1, 1, 0, bias=True) - self.act2 = nn.ReLU(inplace=True) - self.classifier = Linear(out_chs, num_classes) - - def get_classifier(self): - return self.classifier - - def reset_classifier(self, num_classes, global_pool='avg'): - self.num_classes = num_classes - # cannot meaningfully change pooling of efficient head after creation - self.global_pool = SelectAdaptivePool2d(pool_type=global_pool) - self.classifier = Linear(self.pool_dim, num_classes) if num_classes > 0 else nn.Identity() - - def forward_features(self, x): - x = self.conv_stem(x) - x = 
self.bn1(x) - x = self.act1(x) - x = self.blocks(x) - x = self.global_pool(x) - x = self.conv_head(x) - x = self.act2(x) - return x - - def forward(self, x): - x = self.forward_features(x) - if not self.global_pool.is_identity(): - x = x.view(x.size(0), -1) - if self.dropout > 0.: - x = F.dropout(x, p=self.dropout, training=self.training) - x = self.classifier(x) - return x - - -def _create_ghostnet(variant, width=1.0, pretrained=False, **kwargs): - """ - Constructs a GhostNet model - """ - cfgs = [ - # k, t, c, SE, s - # stage1 - [[3, 16, 16, 0, 1]], - # stage2 - [[3, 48, 24, 0, 2]], - [[3, 72, 24, 0, 1]], - # stage3 - [[5, 72, 40, 0.25, 2]], - [[5, 120, 40, 0.25, 1]], - # stage4 - [[3, 240, 80, 0, 2]], - [[3, 200, 80, 0, 1], - [3, 184, 80, 0, 1], - [3, 184, 80, 0, 1], - [3, 480, 112, 0.25, 1], - [3, 672, 112, 0.25, 1] - ], - # stage5 - [[5, 672, 160, 0.25, 2]], - [[5, 960, 160, 0, 1], - [5, 960, 160, 0.25, 1], - [5, 960, 160, 0, 1], - [5, 960, 160, 0.25, 1] - ] - ] - model_kwargs = dict( - cfgs=cfgs, - width=width, - **kwargs, - ) - return build_model_with_cfg( - GhostNet, variant, pretrained, - default_cfg=default_cfgs[variant], - feature_cfg=dict(flatten_sequential=True), - **model_kwargs) - - -@register_model -def ghostnet_050(pretrained=False, **kwargs): - """ GhostNet-0.5x """ - model = _create_ghostnet('ghostnet_050', width=0.5, pretrained=pretrained, **kwargs) - return model - - -@register_model -def ghostnet_100(pretrained=False, **kwargs): - """ GhostNet-1.0x """ - model = _create_ghostnet('ghostnet_100', width=1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def ghostnet_130(pretrained=False, **kwargs): - """ GhostNet-1.3x """ - model = _create_ghostnet('ghostnet_130', width=1.3, pretrained=pretrained, **kwargs) - return model diff --git a/AVLFormer/src/timm/models/gluon_resnet.py b/AVLFormer/src/timm/models/gluon_resnet.py deleted file mode 100644 index 6db1f3b..0000000 --- a/AVLFormer/src/timm/models/gluon_resnet.py +++ /dev/null @@ -1,248 +0,0 @@ -"""Pytorch impl of MxNet Gluon ResNet/(SE)ResNeXt variants -This file evolved from https://github.com/pytorch/vision 'resnet.py' with (SE)-ResNeXt additions -and ports of Gluon variations (https://github.com/dmlc/gluon-cv/blob/master/gluoncv/model_zoo/resnet.py) -by Ross Wightman -""" - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .helpers import build_model_with_cfg -from .layers import SEModule -from .registry import register_model -from .resnet import ResNet, Bottleneck, BasicBlock - - -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), - 'crop_pct': 0.875, 'interpolation': 'bicubic', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'conv1', 'classifier': 'fc', - **kwargs - } - - -default_cfgs = { - 'gluon_resnet18_v1b': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet18_v1b-0757602b.pth'), - 'gluon_resnet34_v1b': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet34_v1b-c6d82d59.pth'), - 'gluon_resnet50_v1b': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet50_v1b-0ebe02e2.pth'), - 'gluon_resnet101_v1b': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet101_v1b-3b017079.pth'), - 'gluon_resnet152_v1b': 
_cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet152_v1b-c1edb0dd.pth'), - 'gluon_resnet50_v1c': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet50_v1c-48092f55.pth', - first_conv='conv1.0'), - 'gluon_resnet101_v1c': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet101_v1c-1f26822a.pth', - first_conv='conv1.0'), - 'gluon_resnet152_v1c': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet152_v1c-a3bb0b98.pth', - first_conv='conv1.0'), - 'gluon_resnet50_v1d': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet50_v1d-818a1b1b.pth', - first_conv='conv1.0'), - 'gluon_resnet101_v1d': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet101_v1d-0f9c8644.pth', - first_conv='conv1.0'), - 'gluon_resnet152_v1d': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet152_v1d-bd354e12.pth', - first_conv='conv1.0'), - 'gluon_resnet50_v1s': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet50_v1s-1762acc0.pth', - first_conv='conv1.0'), - 'gluon_resnet101_v1s': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet101_v1s-60fe0cc1.pth', - first_conv='conv1.0'), - 'gluon_resnet152_v1s': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet152_v1s-dcc41b81.pth', - first_conv='conv1.0'), - 'gluon_resnext50_32x4d': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnext50_32x4d-e6a097c1.pth'), - 'gluon_resnext101_32x4d': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnext101_32x4d-b253c8c4.pth'), - 'gluon_resnext101_64x4d': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnext101_64x4d-f9a8e184.pth'), - 'gluon_seresnext50_32x4d': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_seresnext50_32x4d-90cf2d6e.pth'), - 'gluon_seresnext101_32x4d': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_seresnext101_32x4d-cf52900d.pth'), - 'gluon_seresnext101_64x4d': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_seresnext101_64x4d-f9926f93.pth'), - 'gluon_senet154': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_senet154-70a1a3c0.pth', - first_conv='conv1.0'), -} - - -def _create_resnet(variant, pretrained=False, **kwargs): - return build_model_with_cfg( - ResNet, variant, pretrained, - default_cfg=default_cfgs[variant], - **kwargs) - - -@register_model -def gluon_resnet18_v1b(pretrained=False, **kwargs): - """Constructs a ResNet-18 model. - """ - model_args = dict(block=BasicBlock, layers=[2, 2, 2, 2], **kwargs) - return _create_resnet('gluon_resnet18_v1b', pretrained, **model_args) - - -@register_model -def gluon_resnet34_v1b(pretrained=False, **kwargs): - """Constructs a ResNet-34 model. 
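    Typically instantiated through the registry rather than called directly
    (sketch; assumes this vendored timm exposes `create_model` like upstream):

        model = create_model('gluon_resnet34_v1b', pretrained=False, num_classes=10)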
- """ - model_args = dict(block=BasicBlock, layers=[3, 4, 6, 3], **kwargs) - return _create_resnet('gluon_resnet34_v1b', pretrained, **model_args) - - -@register_model -def gluon_resnet50_v1b(pretrained=False, **kwargs): - """Constructs a ResNet-50 model. - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], **kwargs) - return _create_resnet('gluon_resnet50_v1b', pretrained, **model_args) - - -@register_model -def gluon_resnet101_v1b(pretrained=False, **kwargs): - """Constructs a ResNet-101 model. - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], **kwargs) - return _create_resnet('gluon_resnet101_v1b', pretrained, **model_args) - - -@register_model -def gluon_resnet152_v1b(pretrained=False, **kwargs): - """Constructs a ResNet-152 model. - """ - model_args = dict(block=Bottleneck, layers=[3, 8, 36, 3], **kwargs) - return _create_resnet('gluon_resnet152_v1b', pretrained, **model_args) - - -@register_model -def gluon_resnet50_v1c(pretrained=False, **kwargs): - """Constructs a ResNet-50 model. - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', **kwargs) - return _create_resnet('gluon_resnet50_v1c', pretrained, **model_args) - - -@register_model -def gluon_resnet101_v1c(pretrained=False, **kwargs): - """Constructs a ResNet-101 model. - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], stem_width=32, stem_type='deep', **kwargs) - return _create_resnet('gluon_resnet101_v1c', pretrained, **model_args) - - -@register_model -def gluon_resnet152_v1c(pretrained=False, **kwargs): - """Constructs a ResNet-152 model. - """ - model_args = dict(block=Bottleneck, layers=[3, 8, 36, 3], stem_width=32, stem_type='deep', **kwargs) - return _create_resnet('gluon_resnet152_v1c', pretrained, **model_args) - - -@register_model -def gluon_resnet50_v1d(pretrained=False, **kwargs): - """Constructs a ResNet-50 model. - """ - model_args = dict( - block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs) - return _create_resnet('gluon_resnet50_v1d', pretrained, **model_args) - - -@register_model -def gluon_resnet101_v1d(pretrained=False, **kwargs): - """Constructs a ResNet-101 model. - """ - model_args = dict( - block=Bottleneck, layers=[3, 4, 23, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs) - return _create_resnet('gluon_resnet101_v1d', pretrained, **model_args) - - -@register_model -def gluon_resnet152_v1d(pretrained=False, **kwargs): - """Constructs a ResNet-152 model. - """ - model_args = dict( - block=Bottleneck, layers=[3, 8, 36, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs) - return _create_resnet('gluon_resnet152_v1d', pretrained, **model_args) - - -@register_model -def gluon_resnet50_v1s(pretrained=False, **kwargs): - """Constructs a ResNet-50 model. - """ - model_args = dict( - block=Bottleneck, layers=[3, 4, 6, 3], stem_width=64, stem_type='deep', **kwargs) - return _create_resnet('gluon_resnet50_v1s', pretrained, **model_args) - - - -@register_model -def gluon_resnet101_v1s(pretrained=False, **kwargs): - """Constructs a ResNet-101 model. - """ - model_args = dict( - block=Bottleneck, layers=[3, 4, 23, 3], stem_width=64, stem_type='deep', **kwargs) - return _create_resnet('gluon_resnet101_v1s', pretrained, **model_args) - - -@register_model -def gluon_resnet152_v1s(pretrained=False, **kwargs): - """Constructs a ResNet-152 model. 
- """ - model_args = dict( - block=Bottleneck, layers=[3, 8, 36, 3], stem_width=64, stem_type='deep', **kwargs) - return _create_resnet('gluon_resnet152_v1s', pretrained, **model_args) - - - -@register_model -def gluon_resnext50_32x4d(pretrained=False, **kwargs): - """Constructs a ResNeXt50-32x4d model. - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, **kwargs) - return _create_resnet('gluon_resnext50_32x4d', pretrained, **model_args) - - -@register_model -def gluon_resnext101_32x4d(pretrained=False, **kwargs): - """Constructs a ResNeXt-101 model. - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=4, **kwargs) - return _create_resnet('gluon_resnext101_32x4d', pretrained, **model_args) - - -@register_model -def gluon_resnext101_64x4d(pretrained=False, **kwargs): - """Constructs a ResNeXt-101 model. - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=64, base_width=4, **kwargs) - return _create_resnet('gluon_resnext101_64x4d', pretrained, **model_args) - - -@register_model -def gluon_seresnext50_32x4d(pretrained=False, **kwargs): - """Constructs a SEResNeXt50-32x4d model. - """ - model_args = dict( - block=Bottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, - block_args=dict(attn_layer=SEModule), **kwargs) - return _create_resnet('gluon_seresnext50_32x4d', pretrained, **model_args) - - -@register_model -def gluon_seresnext101_32x4d(pretrained=False, **kwargs): - """Constructs a SEResNeXt-101-32x4d model. - """ - model_args = dict( - block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=4, - block_args=dict(attn_layer=SEModule), **kwargs) - return _create_resnet('gluon_seresnext101_32x4d', pretrained, **model_args) - - -@register_model -def gluon_seresnext101_64x4d(pretrained=False, **kwargs): - """Constructs a SEResNeXt-101-64x4d model. - """ - model_args = dict( - block=Bottleneck, layers=[3, 4, 23, 3], cardinality=64, base_width=4, - block_args=dict(attn_layer=SEModule), **kwargs) - return _create_resnet('gluon_seresnext101_64x4d', pretrained, **model_args) - - -@register_model -def gluon_senet154(pretrained=False, **kwargs): - """Constructs an SENet-154 model. - """ - model_args = dict( - block=Bottleneck, layers=[3, 8, 36, 3], cardinality=64, base_width=4, stem_type='deep', - down_kernel_size=3, block_reduce_first=2, block_args=dict(attn_layer=SEModule), **kwargs) - return _create_resnet('gluon_senet154', pretrained, **model_args) diff --git a/AVLFormer/src/timm/models/gluon_xception.py b/AVLFormer/src/timm/models/gluon_xception.py deleted file mode 100644 index 4ebd197..0000000 --- a/AVLFormer/src/timm/models/gluon_xception.py +++ /dev/null @@ -1,246 +0,0 @@ -"""Pytorch impl of Gluon Xception -This is a port of the Gluon Xception code and weights, itself ported from a PyTorch DeepLab impl. 
- -Gluon model: (https://gluon-cv.mxnet.io/_modules/gluoncv/model_zoo/xception.html) -Original PyTorch DeepLab impl: https://github.com/jfzhang95/pytorch-deeplab-xception - -Hacked together by / Copyright 2020 Ross Wightman -""" -from collections import OrderedDict - -import torch.nn as nn -import torch.nn.functional as F - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .helpers import build_model_with_cfg -from .layers import create_classifier, get_padding -from .registry import register_model - -__all__ = ['Xception65'] - -default_cfgs = { - 'gluon_xception65': { - 'url': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/gluon_xception-7015a15c.pth', - 'input_size': (3, 299, 299), - 'crop_pct': 0.903, - 'pool_size': (10, 10), - 'interpolation': 'bicubic', - 'mean': IMAGENET_DEFAULT_MEAN, - 'std': IMAGENET_DEFAULT_STD, - 'num_classes': 1000, - 'first_conv': 'conv1', - 'classifier': 'fc' - # The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299 - }, -} - -""" PADDING NOTES -The original PyTorch and Gluon impl of these models dutifully reproduced the -aligned padding added to Tensorflow models for Deeplab. This padding was compensating -for Tensorflow 'SAME' padding. PyTorch symmetric padding behaves the way we'd want it to. -""" - - -class SeparableConv2d(nn.Module): - def __init__(self, inplanes, planes, kernel_size=3, stride=1, dilation=1, bias=False, norm_layer=None): - super(SeparableConv2d, self).__init__() - self.kernel_size = kernel_size - self.dilation = dilation - - # depthwise convolution - padding = get_padding(kernel_size, stride, dilation) - self.conv_dw = nn.Conv2d( - inplanes, inplanes, kernel_size, stride=stride, - padding=padding, dilation=dilation, groups=inplanes, bias=bias) - self.bn = norm_layer(num_features=inplanes) - # pointwise convolution - self.conv_pw = nn.Conv2d(inplanes, planes, kernel_size=1, bias=bias) - - def forward(self, x): - x = self.conv_dw(x) - x = self.bn(x) - x = self.conv_pw(x) - return x - - -class Block(nn.Module): - def __init__(self, inplanes, planes, stride=1, dilation=1, start_with_relu=True, norm_layer=None): - super(Block, self).__init__() - if isinstance(planes, (list, tuple)): - assert len(planes) == 3 - else: - planes = (planes,) * 3 - outplanes = planes[-1] - - if outplanes != inplanes or stride != 1: - self.skip = nn.Sequential() - self.skip.add_module('conv1', nn.Conv2d( - inplanes, outplanes, 1, stride=stride, bias=False)), - self.skip.add_module('bn1', norm_layer(num_features=outplanes)) - else: - self.skip = None - - rep = OrderedDict() - for i in range(3): - rep['act%d' % (i + 1)] = nn.ReLU(inplace=True) - rep['conv%d' % (i + 1)] = SeparableConv2d( - inplanes, planes[i], 3, stride=stride if i == 2 else 1, dilation=dilation, norm_layer=norm_layer) - rep['bn%d' % (i + 1)] = norm_layer(planes[i]) - inplanes = planes[i] - - if not start_with_relu: - del rep['act1'] - else: - rep['act1'] = nn.ReLU(inplace=False) - self.rep = nn.Sequential(rep) - - def forward(self, x): - skip = x - if self.skip is not None: - skip = self.skip(skip) - x = self.rep(x) + skip - return x - - -class Xception65(nn.Module): - """Modified Aligned Xception. 
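    Effect of `output_stride` on the default 299x299 input (approximate; values
    wired up in __init__ below):

        output_stride=32 -> ~10x10 final features (matches default_cfg pool_size)
        output_stride=16 -> ~19x19; exit block stride 1, exit dilations (1, 2)
        output_stride=8  -> ~38x38; middle dilation 2, exit dilations (2, 4)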
- - NOTE: only the 65 layer version is included here, the 71 layer variant - was not correct and had no pretrained weights - """ - - def __init__(self, num_classes=1000, in_chans=3, output_stride=32, norm_layer=nn.BatchNorm2d, - drop_rate=0., global_pool='avg'): - super(Xception65, self).__init__() - self.num_classes = num_classes - self.drop_rate = drop_rate - if output_stride == 32: - entry_block3_stride = 2 - exit_block20_stride = 2 - middle_dilation = 1 - exit_dilation = (1, 1) - elif output_stride == 16: - entry_block3_stride = 2 - exit_block20_stride = 1 - middle_dilation = 1 - exit_dilation = (1, 2) - elif output_stride == 8: - entry_block3_stride = 1 - exit_block20_stride = 1 - middle_dilation = 2 - exit_dilation = (2, 4) - else: - raise NotImplementedError - - # Entry flow - self.conv1 = nn.Conv2d(in_chans, 32, kernel_size=3, stride=2, padding=1, bias=False) - self.bn1 = norm_layer(num_features=32) - self.act1 = nn.ReLU(inplace=True) - - self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1, bias=False) - self.bn2 = norm_layer(num_features=64) - self.act2 = nn.ReLU(inplace=True) - - self.block1 = Block(64, 128, stride=2, start_with_relu=False, norm_layer=norm_layer) - self.block1_act = nn.ReLU(inplace=True) - self.block2 = Block(128, 256, stride=2, start_with_relu=False, norm_layer=norm_layer) - self.block3 = Block(256, 728, stride=entry_block3_stride, norm_layer=norm_layer) - - # Middle flow - self.mid = nn.Sequential(OrderedDict([('block%d' % i, Block( - 728, 728, stride=1, dilation=middle_dilation, norm_layer=norm_layer)) for i in range(4, 20)])) - - # Exit flow - self.block20 = Block( - 728, (728, 1024, 1024), stride=exit_block20_stride, dilation=exit_dilation[0], norm_layer=norm_layer) - self.block20_act = nn.ReLU(inplace=True) - - self.conv3 = SeparableConv2d(1024, 1536, 3, stride=1, dilation=exit_dilation[1], norm_layer=norm_layer) - self.bn3 = norm_layer(num_features=1536) - self.act3 = nn.ReLU(inplace=True) - - self.conv4 = SeparableConv2d(1536, 1536, 3, stride=1, dilation=exit_dilation[1], norm_layer=norm_layer) - self.bn4 = norm_layer(num_features=1536) - self.act4 = nn.ReLU(inplace=True) - - self.num_features = 2048 - self.conv5 = SeparableConv2d( - 1536, self.num_features, 3, stride=1, dilation=exit_dilation[1], norm_layer=norm_layer) - self.bn5 = norm_layer(num_features=self.num_features) - self.act5 = nn.ReLU(inplace=True) - self.feature_info = [ - dict(num_chs=64, reduction=2, module='act2'), - dict(num_chs=128, reduction=4, module='block1_act'), - dict(num_chs=256, reduction=8, module='block3.rep.act1'), - dict(num_chs=728, reduction=16, module='block20.rep.act1'), - dict(num_chs=2048, reduction=32, module='act5'), - ] - - self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) - - def get_classifier(self): - return self.fc - - def reset_classifier(self, num_classes, global_pool='avg'): - self.num_classes = num_classes - self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) - - def forward_features(self, x): - # Entry flow - x = self.conv1(x) - x = self.bn1(x) - x = self.act1(x) - - x = self.conv2(x) - x = self.bn2(x) - x = self.act2(x) - - x = self.block1(x) - x = self.block1_act(x) - # c1 = x - x = self.block2(x) - # c2 = x - x = self.block3(x) - - # Middle flow - x = self.mid(x) - # c3 = x - - # Exit flow - x = self.block20(x) - x = self.block20_act(x) - x = self.conv3(x) - x = self.bn3(x) - x = self.act3(x) - - x = self.conv4(x) - x = self.bn4(x) - 
x = self.act4(x) - - x = self.conv5(x) - x = self.bn5(x) - x = self.act5(x) - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.global_pool(x) - if self.drop_rate: - x = F.dropout(x, self.drop_rate, training=self.training) - x = self.fc(x) - return x - - -def _create_gluon_xception(variant, pretrained=False, **kwargs): - return build_model_with_cfg( - Xception65, variant, pretrained, - default_cfg=default_cfgs[variant], - feature_cfg=dict(feature_cls='hook'), - **kwargs) - - -@register_model -def gluon_xception65(pretrained=False, **kwargs): - """ Modified Aligned Xception-65 - """ - return _create_gluon_xception('gluon_xception65', pretrained, **kwargs) diff --git a/AVLFormer/src/timm/models/hardcorenas.py b/AVLFormer/src/timm/models/hardcorenas.py deleted file mode 100644 index f1f6b4d..0000000 --- a/AVLFormer/src/timm/models/hardcorenas.py +++ /dev/null @@ -1,149 +0,0 @@ -import torch.nn as nn -from .efficientnet_builder import decode_arch_def, resolve_bn_args -from .mobilenetv3 import MobileNetV3, MobileNetV3Features, build_model_with_cfg, default_cfg_for_features -from .layers import hard_sigmoid -from .efficientnet_blocks import resolve_act_layer -from .registry import register_model -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD - - -def _cfg(url='', **kwargs): - return { - 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (1, 1), - 'crop_pct': 0.875, 'interpolation': 'bilinear', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'conv_stem', 'classifier': 'classifier', - **kwargs - } - - -default_cfgs = { - 'hardcorenas_a': _cfg(url='https://miil-public-eu.oss-eu-central-1.aliyuncs.com/public/HardCoReNAS/HardCoreNAS_A_Green_38ms_75.9_23474aeb.pth'), - 'hardcorenas_b': _cfg(url='https://miil-public-eu.oss-eu-central-1.aliyuncs.com/public/HardCoReNAS/HardCoreNAS_B_Green_40ms_76.5_1f882d1e.pth'), - 'hardcorenas_c': _cfg(url='https://miil-public-eu.oss-eu-central-1.aliyuncs.com/public/HardCoReNAS/HardCoreNAS_C_Green_44ms_77.1_d4148c9e.pth'), - 'hardcorenas_d': _cfg(url='https://miil-public-eu.oss-eu-central-1.aliyuncs.com/public/HardCoReNAS/HardCoreNAS_D_Green_50ms_77.4_23e3cdde.pth'), - 'hardcorenas_e': _cfg(url='https://miil-public-eu.oss-eu-central-1.aliyuncs.com/public/HardCoReNAS/HardCoreNAS_E_Green_55ms_77.9_90f20e8a.pth'), - 'hardcorenas_f': _cfg(url='https://miil-public-eu.oss-eu-central-1.aliyuncs.com/public/HardCoReNAS/HardCoreNAS_F_Green_60ms_78.1_2855edf1.pth'), -} - - -def _gen_hardcorenas(pretrained, variant, arch_def, **kwargs): - """Creates a hardcorenas model - - Ref impl: https://github.com/Alibaba-MIIL/HardCoReNAS - Paper: https://arxiv.org/abs/2102.11646 - - """ - num_features = 1280 - - model_kwargs = dict( - block_args=decode_arch_def(arch_def), - num_features=num_features, - stem_size=32, - channel_multiplier=1, - norm_kwargs=resolve_bn_args(kwargs), - act_layer=resolve_act_layer(kwargs, 'hard_swish'), - se_kwargs=dict(act_layer=nn.ReLU, gate_fn=hard_sigmoid, reduce_mid=True, divisor=8), - **kwargs, - ) - - features_only = False - model_cls = MobileNetV3 - kwargs_filter = None - if model_kwargs.pop('features_only', False): - features_only = True - kwargs_filter = ('num_classes', 'num_features', 'head_conv', 'head_bias', 'global_pool') - model_cls = MobileNetV3Features - model = build_model_with_cfg( - model_cls, variant, pretrained, - default_cfg=default_cfgs[variant], - pretrained_strict=not features_only, - kwargs_filter=kwargs_filter, - 
**model_kwargs) - if features_only: - model.default_cfg = default_cfg_for_features(model.default_cfg) - return model - - -@register_model -def hardcorenas_a(pretrained=False, **kwargs): - """ hardcorenas_A """ - arch_def = [['ds_r1_k3_s1_e1_c16_nre'], ['ir_r1_k5_s2_e3_c24_nre', 'ir_r1_k5_s1_e3_c24_nre_se0.25'], - ['ir_r1_k5_s2_e3_c40_nre', 'ir_r1_k5_s1_e6_c40_nre_se0.25'], - ['ir_r1_k5_s2_e6_c80_se0.25', 'ir_r1_k5_s1_e6_c80_se0.25'], - ['ir_r1_k5_s1_e6_c112_se0.25', 'ir_r1_k5_s1_e6_c112_se0.25'], - ['ir_r1_k5_s2_e6_c192_se0.25', 'ir_r1_k5_s1_e6_c192_se0.25'], ['cn_r1_k1_s1_c960']] - model = _gen_hardcorenas(pretrained=pretrained, variant='hardcorenas_a', arch_def=arch_def, **kwargs) - return model - - -@register_model -def hardcorenas_b(pretrained=False, **kwargs): - """ hardcorenas_B """ - arch_def = [['ds_r1_k3_s1_e1_c16_nre'], - ['ir_r1_k5_s2_e3_c24_nre', 'ir_r1_k5_s1_e3_c24_nre_se0.25', 'ir_r1_k3_s1_e3_c24_nre'], - ['ir_r1_k5_s2_e3_c40_nre', 'ir_r1_k5_s1_e3_c40_nre', 'ir_r1_k5_s1_e3_c40_nre'], - ['ir_r1_k5_s2_e3_c80', 'ir_r1_k5_s1_e3_c80', 'ir_r1_k3_s1_e3_c80', 'ir_r1_k3_s1_e3_c80'], - ['ir_r1_k5_s1_e3_c112', 'ir_r1_k3_s1_e3_c112', 'ir_r1_k3_s1_e3_c112', 'ir_r1_k3_s1_e3_c112'], - ['ir_r1_k5_s2_e6_c192_se0.25', 'ir_r1_k5_s1_e6_c192_se0.25', 'ir_r1_k3_s1_e3_c192_se0.25'], - ['cn_r1_k1_s1_c960']] - model = _gen_hardcorenas(pretrained=pretrained, variant='hardcorenas_b', arch_def=arch_def, **kwargs) - return model - - -@register_model -def hardcorenas_c(pretrained=False, **kwargs): - """ hardcorenas_C """ - arch_def = [['ds_r1_k3_s1_e1_c16_nre'], ['ir_r1_k5_s2_e3_c24_nre', 'ir_r1_k5_s1_e3_c24_nre_se0.25'], - ['ir_r1_k5_s2_e3_c40_nre', 'ir_r1_k5_s1_e3_c40_nre', 'ir_r1_k5_s1_e3_c40_nre', - 'ir_r1_k5_s1_e3_c40_nre'], - ['ir_r1_k5_s2_e4_c80', 'ir_r1_k5_s1_e6_c80_se0.25', 'ir_r1_k3_s1_e3_c80', 'ir_r1_k3_s1_e3_c80'], - ['ir_r1_k5_s1_e6_c112_se0.25', 'ir_r1_k3_s1_e3_c112', 'ir_r1_k3_s1_e3_c112', 'ir_r1_k3_s1_e3_c112'], - ['ir_r1_k5_s2_e6_c192_se0.25', 'ir_r1_k5_s1_e6_c192_se0.25', 'ir_r1_k3_s1_e3_c192_se0.25'], - ['cn_r1_k1_s1_c960']] - model = _gen_hardcorenas(pretrained=pretrained, variant='hardcorenas_c', arch_def=arch_def, **kwargs) - return model - - -@register_model -def hardcorenas_d(pretrained=False, **kwargs): - """ hardcorenas_D """ - arch_def = [['ds_r1_k3_s1_e1_c16_nre'], ['ir_r1_k5_s2_e3_c24_nre_se0.25', 'ir_r1_k5_s1_e3_c24_nre_se0.25'], - ['ir_r1_k5_s2_e3_c40_nre_se0.25', 'ir_r1_k5_s1_e4_c40_nre_se0.25', 'ir_r1_k3_s1_e3_c40_nre_se0.25'], - ['ir_r1_k5_s2_e4_c80_se0.25', 'ir_r1_k3_s1_e3_c80_se0.25', 'ir_r1_k3_s1_e3_c80_se0.25', - 'ir_r1_k3_s1_e3_c80_se0.25'], - ['ir_r1_k3_s1_e4_c112_se0.25', 'ir_r1_k5_s1_e4_c112_se0.25', 'ir_r1_k3_s1_e3_c112_se0.25', - 'ir_r1_k5_s1_e3_c112_se0.25'], - ['ir_r1_k5_s2_e6_c192_se0.25', 'ir_r1_k5_s1_e6_c192_se0.25', 'ir_r1_k5_s1_e6_c192_se0.25', - 'ir_r1_k3_s1_e6_c192_se0.25'], ['cn_r1_k1_s1_c960']] - model = _gen_hardcorenas(pretrained=pretrained, variant='hardcorenas_d', arch_def=arch_def, **kwargs) - return model - - -@register_model -def hardcorenas_e(pretrained=False, **kwargs): - """ hardcorenas_E """ - arch_def = [['ds_r1_k3_s1_e1_c16_nre'], ['ir_r1_k5_s2_e3_c24_nre_se0.25', 'ir_r1_k5_s1_e3_c24_nre_se0.25'], - ['ir_r1_k5_s2_e6_c40_nre_se0.25', 'ir_r1_k5_s1_e4_c40_nre_se0.25', 'ir_r1_k5_s1_e4_c40_nre_se0.25', - 'ir_r1_k3_s1_e3_c40_nre_se0.25'], ['ir_r1_k5_s2_e4_c80_se0.25', 'ir_r1_k3_s1_e6_c80_se0.25'], - ['ir_r1_k5_s1_e6_c112_se0.25', 'ir_r1_k5_s1_e6_c112_se0.25', 'ir_r1_k5_s1_e6_c112_se0.25', - 'ir_r1_k5_s1_e3_c112_se0.25'], - 
['ir_r1_k5_s2_e6_c192_se0.25', 'ir_r1_k5_s1_e6_c192_se0.25', 'ir_r1_k5_s1_e6_c192_se0.25', - 'ir_r1_k3_s1_e6_c192_se0.25'], ['cn_r1_k1_s1_c960']] - model = _gen_hardcorenas(pretrained=pretrained, variant='hardcorenas_e', arch_def=arch_def, **kwargs) - return model - - -@register_model -def hardcorenas_f(pretrained=False, **kwargs): - """ hardcorenas_F """ - arch_def = [['ds_r1_k3_s1_e1_c16_nre'], ['ir_r1_k5_s2_e3_c24_nre_se0.25', 'ir_r1_k5_s1_e3_c24_nre_se0.25'], - ['ir_r1_k5_s2_e6_c40_nre_se0.25', 'ir_r1_k5_s1_e6_c40_nre_se0.25'], - ['ir_r1_k5_s2_e6_c80_se0.25', 'ir_r1_k5_s1_e6_c80_se0.25', 'ir_r1_k3_s1_e3_c80_se0.25', - 'ir_r1_k3_s1_e3_c80_se0.25'], - ['ir_r1_k3_s1_e6_c112_se0.25', 'ir_r1_k5_s1_e6_c112_se0.25', 'ir_r1_k5_s1_e6_c112_se0.25', - 'ir_r1_k3_s1_e3_c112_se0.25'], - ['ir_r1_k5_s2_e6_c192_se0.25', 'ir_r1_k5_s1_e6_c192_se0.25', 'ir_r1_k3_s1_e6_c192_se0.25', - 'ir_r1_k3_s1_e6_c192_se0.25'], ['cn_r1_k1_s1_c960']] - model = _gen_hardcorenas(pretrained=pretrained, variant='hardcorenas_f', arch_def=arch_def, **kwargs) - return model diff --git a/AVLFormer/src/timm/models/helpers.py b/AVLFormer/src/timm/models/helpers.py deleted file mode 100644 index e9ac7f0..0000000 --- a/AVLFormer/src/timm/models/helpers.py +++ /dev/null @@ -1,475 +0,0 @@ -""" Model creation / weight loading / state_dict helpers - -Hacked together by / Copyright 2020 Ross Wightman -""" -import logging -import os -import math -from collections import OrderedDict -from copy import deepcopy -from typing import Any, Callable, Optional, Tuple - -import torch -import torch.nn as nn - - -from .features import FeatureListNet, FeatureDictNet, FeatureHookNet -from .hub import has_hf_hub, download_cached_file, load_state_dict_from_hf, load_state_dict_from_url -from .layers import Conv2dSame, Linear - - -_logger = logging.getLogger(__name__) - - -def load_state_dict(checkpoint_path, use_ema=False): - if checkpoint_path and os.path.isfile(checkpoint_path): - checkpoint = torch.load(checkpoint_path, map_location='cpu') - state_dict_key = 'state_dict' - if isinstance(checkpoint, dict): - if use_ema and 'state_dict_ema' in checkpoint: - state_dict_key = 'state_dict_ema' - if state_dict_key and state_dict_key in checkpoint: - new_state_dict = OrderedDict() - for k, v in checkpoint[state_dict_key].items(): - # strip `module.` prefix - name = k[7:] if k.startswith('module') else k - new_state_dict[name] = v - state_dict = new_state_dict - else: - state_dict = checkpoint - _logger.info("Loaded {} from checkpoint '{}'".format(state_dict_key, checkpoint_path)) - return state_dict - else: - _logger.error("No checkpoint found at '{}'".format(checkpoint_path)) - raise FileNotFoundError() - - -def load_checkpoint(model, checkpoint_path, use_ema=False, strict=True): - state_dict = load_state_dict(checkpoint_path, use_ema) - model.load_state_dict(state_dict, strict=strict) - - -def resume_checkpoint(model, checkpoint_path, optimizer=None, loss_scaler=None, log_info=True): - resume_epoch = None - if os.path.isfile(checkpoint_path): - checkpoint = torch.load(checkpoint_path, map_location='cpu') - if isinstance(checkpoint, dict) and 'state_dict' in checkpoint: - if log_info: - _logger.info('Restoring model state from checkpoint...') - new_state_dict = OrderedDict() - for k, v in checkpoint['state_dict'].items(): - name = k[7:] if k.startswith('module') else k - new_state_dict[name] = v - model.load_state_dict(new_state_dict) - - if optimizer is not None and 'optimizer' in checkpoint: - if log_info: - _logger.info('Restoring optimizer state from 
checkpoint...') - optimizer.load_state_dict(checkpoint['optimizer']) - - if loss_scaler is not None and loss_scaler.state_dict_key in checkpoint: - if log_info: - _logger.info('Restoring AMP loss scaler state from checkpoint...') - loss_scaler.load_state_dict(checkpoint[loss_scaler.state_dict_key]) - - if 'epoch' in checkpoint: - resume_epoch = checkpoint['epoch'] - if 'version' in checkpoint and checkpoint['version'] > 1: - resume_epoch += 1 # start at the next epoch, old checkpoints incremented before save - - if log_info: - _logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, checkpoint['epoch'])) - else: - model.load_state_dict(checkpoint) - if log_info: - _logger.info("Loaded checkpoint '{}'".format(checkpoint_path)) - return resume_epoch - else: - _logger.error("No checkpoint found at '{}'".format(checkpoint_path)) - raise FileNotFoundError() - - -def load_custom_pretrained(model, default_cfg=None, load_fn=None, progress=False, check_hash=False): - r"""Loads a custom (read non .pth) weight file - - Downloads checkpoint file into cache-dir like torch.hub based loaders, but calls - a passed in custom load fn, or the `load_pretrained` model member fn. - - If the object is already present in `model_dir`, it's deserialized and returned. - The default value of `model_dir` is ``<hub_dir>/checkpoints`` where - `hub_dir` is the directory returned by :func:`~torch.hub.get_dir`. - - Args: - model: The instantiated model to load weights into - default_cfg (dict): Default pretrained model cfg - load_fn: An external stand alone fn that loads weights into provided model, otherwise a fn named - 'load_pretrained' on the model will be called if it exists - progress (bool, optional): whether or not to display a progress bar to stderr. Default: False - check_hash(bool, optional): If True, the filename part of the URL should follow the naming convention - ``filename-<sha256>.ext`` where ``<sha256>`` is the first eight or more - digits of the SHA256 hash of the contents of the file. The hash is used to - ensure unique names and to verify the contents of the file. Default: False - """ - default_cfg = default_cfg or getattr(model, 'default_cfg', None) or {} - pretrained_url = default_cfg.get('url', None) - if not pretrained_url: - _logger.warning("No pretrained weights exist for this model. Using random initialization.") - return - cached_file = download_cached_file(default_cfg['url'], check_hash=check_hash, progress=progress) - - if load_fn is not None: - load_fn(model, cached_file) - elif hasattr(model, 'load_pretrained'): - model.load_pretrained(cached_file) - else: - _logger.warning("Valid function to load pretrained weights is not available, using random initialization.") - - -def adapt_input_conv(in_chans, conv_weight): - conv_type = conv_weight.dtype - conv_weight = conv_weight.float() # Some weights are in torch.half, ensure it's float for sum on CPU - O, I, J, K = conv_weight.shape - if in_chans == 1: - if I > 3: - assert conv_weight.shape[1] % 3 == 0 - # For models with space2depth stems - conv_weight = conv_weight.reshape(O, I // 3, 3, J, K) - conv_weight = conv_weight.sum(dim=2, keepdim=False) - else: - conv_weight = conv_weight.sum(dim=1, keepdim=True) - elif in_chans != 3: - if I != 3: - raise NotImplementedError('Weight format not supported by conversion.') - else: - # NOTE this strategy should be better than random init, but there could be other combinations of - # the original RGB input layer weights that'd work better for specific cases.
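# (Worked example, in_chans=4: repeat=ceil(4/3)=2 tiles the RGB weights to 6
#  channels, the [:, :in_chans] slice keeps the first 4, and the 3/4 rescale
#  keeps the expected activation magnitude roughly unchanged.)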
- repeat = int(math.ceil(in_chans / 3)) - conv_weight = conv_weight.repeat(1, repeat, 1, 1)[:, :in_chans, :, :] - conv_weight *= (3 / float(in_chans)) - conv_weight = conv_weight.to(conv_type) - return conv_weight - - -def load_pretrained(model, default_cfg=None, num_classes=1000, in_chans=3, filter_fn=None, strict=True, progress=False): - """ Load pretrained checkpoint - - Args: - model (nn.Module) : PyTorch model module - default_cfg (Optional[Dict]): default configuration for pretrained weights / target dataset - num_classes (int): num_classes for model - in_chans (int): in_chans for model - filter_fn (Optional[Callable]): state_dict filter fn for load (takes state_dict, model as args) - strict (bool): strict load of checkpoint - progress (bool): enable progress bar for weight download - - """ - default_cfg = default_cfg or getattr(model, 'default_cfg', None) or {} - pretrained_url = default_cfg.get('url', None) - hf_hub_id = default_cfg.get('hf_hub', None) - if not pretrained_url and not hf_hub_id: - _logger.warning("No pretrained weights exist for this model. Using random initialization.") - return - if hf_hub_id and has_hf_hub(necessary=not pretrained_url): - _logger.info(f'Loading pretrained weights from Hugging Face hub ({hf_hub_id})') - state_dict = load_state_dict_from_hf(hf_hub_id) - else: - _logger.info(f'Loading pretrained weights from url ({pretrained_url})') - state_dict = load_state_dict_from_url(pretrained_url, progress=progress, map_location='cpu') - if filter_fn is not None: - # for backwards compat with filter fns that take one arg; try one arg first, then two - try: - state_dict = filter_fn(state_dict) - except TypeError: - state_dict = filter_fn(state_dict, model) - - input_convs = default_cfg.get('first_conv', None) - if input_convs is not None and in_chans != 3: - if isinstance(input_convs, str): - input_convs = (input_convs,) - for input_conv_name in input_convs: - weight_name = input_conv_name + '.weight' - try: - state_dict[weight_name] = adapt_input_conv(in_chans, state_dict[weight_name]) - _logger.info( - f'Converted input conv {input_conv_name} pretrained weights from 3 to {in_chans} channel(s)') - except NotImplementedError: - del state_dict[weight_name] - strict = False - _logger.warning( - f'Unable to convert pretrained {input_conv_name} weights, using random init for this layer.') - - classifiers = default_cfg.get('classifier', None) - label_offset = default_cfg.get('label_offset', 0) - if classifiers is not None: - if isinstance(classifiers, str): - classifiers = (classifiers,) - if num_classes != default_cfg['num_classes']: - for classifier_name in classifiers: - # completely discard fully connected if model num_classes doesn't match pretrained weights - del state_dict[classifier_name + '.weight'] - del state_dict[classifier_name + '.bias'] - strict = False - elif label_offset > 0: - for classifier_name in classifiers: - # special case for pretrained weights with an extra background class in pretrained weights - classifier_weight = state_dict[classifier_name + '.weight'] - state_dict[classifier_name + '.weight'] = classifier_weight[label_offset:] - classifier_bias = state_dict[classifier_name + '.bias'] - state_dict[classifier_name + '.bias'] = classifier_bias[label_offset:] - - model.load_state_dict(state_dict, strict=strict) - - -def extract_layer(model, layer): - layer = layer.split('.') - module = model - if hasattr(model, 'module') and layer[0] != 'module': - module = model.module - if not hasattr(model, 'module') and layer[0] == 'module': - layer = 
layer[1:] - for l in layer: - if hasattr(module, l): - if not l.isdigit(): - module = getattr(module, l) - else: - module = module[int(l)] - else: - return module - return module - - -def set_layer(model, layer, val): - layer = layer.split('.') - module = model - if hasattr(model, 'module') and layer[0] != 'module': - module = model.module - lst_index = 0 - module2 = module - for l in layer: - if hasattr(module2, l): - if not l.isdigit(): - module2 = getattr(module2, l) - else: - module2 = module2[int(l)] - lst_index += 1 - lst_index -= 1 - for l in layer[:lst_index]: - if not l.isdigit(): - module = getattr(module, l) - else: - module = module[int(l)] - l = layer[lst_index] - setattr(module, l, val) - - -def adapt_model_from_string(parent_module, model_string): - separator = '***' - state_dict = {} - lst_shape = model_string.split(separator) - for k in lst_shape: - k = k.split(':') - key = k[0] - shape = k[1][1:-1].split(',') - if shape[0] != '': - state_dict[key] = [int(i) for i in shape] - - new_module = deepcopy(parent_module) - for n, m in parent_module.named_modules(): - old_module = extract_layer(parent_module, n) - if isinstance(old_module, nn.Conv2d) or isinstance(old_module, Conv2dSame): - if isinstance(old_module, Conv2dSame): - conv = Conv2dSame - else: - conv = nn.Conv2d - s = state_dict[n + '.weight'] - in_channels = s[1] - out_channels = s[0] - g = 1 - if old_module.groups > 1: - in_channels = out_channels - g = in_channels - new_conv = conv( - in_channels=in_channels, out_channels=out_channels, kernel_size=old_module.kernel_size, - bias=old_module.bias is not None, padding=old_module.padding, dilation=old_module.dilation, - groups=g, stride=old_module.stride) - set_layer(new_module, n, new_conv) - if isinstance(old_module, nn.BatchNorm2d): - new_bn = nn.BatchNorm2d( - num_features=state_dict[n + '.weight'][0], eps=old_module.eps, momentum=old_module.momentum, - affine=old_module.affine, track_running_stats=True) - set_layer(new_module, n, new_bn) - if isinstance(old_module, nn.Linear): - # FIXME extra checks to ensure this is actually the FC classifier layer and not a diff Linear layer? - num_features = state_dict[n + '.weight'][1] - new_fc = Linear( - in_features=num_features, out_features=old_module.out_features, bias=old_module.bias is not None) - set_layer(new_module, n, new_fc) - if hasattr(new_module, 'num_features'): - new_module.num_features = num_features - new_module.eval() - parent_module.eval() - - return new_module - - -def adapt_model_from_file(parent_module, model_variant): - adapt_file = os.path.join(os.path.dirname(__file__), 'pruned', model_variant + '.txt') - with open(adapt_file, 'r') as f: - return adapt_model_from_string(parent_module, f.read().strip()) - - -def default_cfg_for_features(default_cfg): - default_cfg = deepcopy(default_cfg) - # remove default pretrained cfg fields that don't have much relevance for feature backbone - to_remove = ('num_classes', 'crop_pct', 'classifier', 'global_pool') # add default final pool size? - for tr in to_remove: - default_cfg.pop(tr, None) - return default_cfg - - -def overlay_external_default_cfg(default_cfg, kwargs): - """ Overlay 'external_default_cfg' in kwargs on top of default_cfg arg. 
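    Sketch (hypothetical values):

        kwargs = dict(external_default_cfg=dict(url='file:///tmp/w.pth', num_classes=10))
        overlay_external_default_cfg(default_cfg, kwargs)
        # default_cfg's own 'url'/'hf_hub' entries are dropped, then the external ones merged in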
- """ - external_default_cfg = kwargs.pop('external_default_cfg', None) - if external_default_cfg: - default_cfg.pop('url', None) # url should come from external cfg - default_cfg.pop('hf_hub', None) # hf hub id should come from external cfg - default_cfg.update(external_default_cfg) - - -def set_default_kwargs(kwargs, names, default_cfg): - for n in names: - # for legacy reasons, model __init__args uses img_size + in_chans as separate args while - # default_cfg has one input_size=(C, H ,W) entry - if n == 'img_size': - input_size = default_cfg.get('input_size', None) - if input_size is not None: - assert len(input_size) == 3 - kwargs.setdefault(n, input_size[-2:]) - elif n == 'in_chans': - input_size = default_cfg.get('input_size', None) - if input_size is not None: - assert len(input_size) == 3 - kwargs.setdefault(n, input_size[0]) - else: - default_val = default_cfg.get(n, None) - if default_val is not None: - kwargs.setdefault(n, default_cfg[n]) - - -def filter_kwargs(kwargs, names): - if not kwargs or not names: - return - for n in names: - kwargs.pop(n, None) - - -def update_default_cfg_and_kwargs(default_cfg, kwargs, kwargs_filter): - """ Update the default_cfg and kwargs before passing to model - - FIXME this sequence of overlay default_cfg, set default kwargs, filter kwargs - could/should be replaced by an improved configuration mechanism - - Args: - default_cfg: input default_cfg (updated in-place) - kwargs: keyword args passed to model build fn (updated in-place) - kwargs_filter: keyword arg keys that must be removed before model __init__ - """ - # Overlay default cfg values from `external_default_cfg` if it exists in kwargs - overlay_external_default_cfg(default_cfg, kwargs) - # Set model __init__ args that can be determined by default_cfg (if not already passed as kwargs) - set_default_kwargs(kwargs, names=('num_classes', 'global_pool', 'in_chans'), default_cfg=default_cfg) - # Filter keyword args for task specific model variants (some 'features only' models, etc.) 
- filter_kwargs(kwargs, names=kwargs_filter) - - -def build_model_with_cfg( - model_cls: Callable, - variant: str, - pretrained: bool, - default_cfg: dict, - model_cfg: Optional[Any] = None, - feature_cfg: Optional[dict] = None, - pretrained_strict: bool = True, - pretrained_filter_fn: Optional[Callable] = None, - pretrained_custom_load: bool = False, - kwargs_filter: Optional[Tuple[str]] = None, - **kwargs): - """ Build model with specified default_cfg and optional model_cfg - - This helper fn aids in the construction of a model including: - * handling default_cfg and associated pretrained weight loading - * passing through optional model_cfg for models with config based arch spec - * features_only model adaptation - * pruning config / model adaptation - - Args: - model_cls (nn.Module): model class - variant (str): model variant name - pretrained (bool): load pretrained weights - default_cfg (dict): model's default pretrained/task config - model_cfg (Optional[Dict]): model's architecture config - feature_cfg (Optional[Dict]): feature extraction adapter config - pretrained_strict (bool): load pretrained weights strictly - pretrained_filter_fn (Optional[Callable]): filter callable for pretrained weights - pretrained_custom_load (bool): use custom load fn, to load numpy or other non PyTorch weights - kwargs_filter (Optional[Tuple]): kwargs to filter before passing to model - **kwargs: model args passed through to model __init__ - """ - pruned = kwargs.pop('pruned', False) - features = False - feature_cfg = feature_cfg or {} - default_cfg = deepcopy(default_cfg) if default_cfg else {} - update_default_cfg_and_kwargs(default_cfg, kwargs, kwargs_filter) - default_cfg.setdefault('architecture', variant) - - # Setup for feature extraction wrapper done at end of this fn - if kwargs.pop('features_only', False): - features = True - feature_cfg.setdefault('out_indices', (0, 1, 2, 3, 4)) - if 'out_indices' in kwargs: - feature_cfg['out_indices'] = kwargs.pop('out_indices') - - # Build the model - model = model_cls(**kwargs) if model_cfg is None else model_cls(cfg=model_cfg, **kwargs) - model.default_cfg = default_cfg - - if pruned: - model = adapt_model_from_file(model, variant) - - # For classification models, check class attr, then kwargs, then default to 1k, otherwise 0 for feats - num_classes_pretrained = 0 if features else getattr(model, 'num_classes', kwargs.get('num_classes', 1000)) - if pretrained: - if pretrained_custom_load: - load_custom_pretrained(model) - else: - load_pretrained( - model, - num_classes=num_classes_pretrained, - in_chans=kwargs.get('in_chans', 3), - filter_fn=pretrained_filter_fn, - strict=pretrained_strict) - - # Wrap the model in a feature extraction module if enabled - if features: - feature_cls = FeatureListNet - if 'feature_cls' in feature_cfg: - feature_cls = feature_cfg.pop('feature_cls') - if isinstance(feature_cls, str): - feature_cls = feature_cls.lower() - if 'hook' in feature_cls: - feature_cls = FeatureHookNet - else: - assert False, f'Unknown feature class {feature_cls}' - model = feature_cls(model, **feature_cfg) - model.default_cfg = default_cfg_for_features(default_cfg) # add back default_cfg - - return model - - -def model_parameters(model, exclude_head=False): - if exclude_head: - # FIXME this is a bit of a quick and dirty hack to skip classifier head params based on ordering - return [p for p in model.parameters()][:-2] - else: - return model.parameters() diff --git a/AVLFormer/src/timm/models/hrnet.py b/AVLFormer/src/timm/models/hrnet.py deleted file mode 
100644 index 75d214b..0000000 --- a/AVLFormer/src/timm/models/hrnet.py +++ /dev/null @@ -1,836 +0,0 @@ -""" HRNet - -Copied from https://github.com/HRNet/HRNet-Image-Classification - -Original header: - Copyright (c) Microsoft - Licensed under the MIT License. - Written by Bin Xiao (Bin.Xiao@microsoft.com) - Modified by Ke Sun (sunk@mail.ustc.edu.cn) -""" -import logging -from typing import List - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .features import FeatureInfo -from .helpers import build_model_with_cfg, default_cfg_for_features -from .layers import create_classifier -from .registry import register_model -from .resnet import BasicBlock, Bottleneck # leveraging ResNet blocks w/ additional features like SE - -_BN_MOMENTUM = 0.1 -_logger = logging.getLogger(__name__) - - -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), - 'crop_pct': 0.875, 'interpolation': 'bilinear', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'conv1', 'classifier': 'classifier', - **kwargs - } - - -default_cfgs = { - 'hrnet_w18_small': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnet_w18_small_v1-f460c6bc.pth'), - 'hrnet_w18_small_v2': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnet_w18_small_v2-4c50a8cb.pth'), - 'hrnet_w18': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnetv2_w18-8cb57bb9.pth'), - 'hrnet_w30': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnetv2_w30-8d7f8dab.pth'), - 'hrnet_w32': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnetv2_w32-90d8c5fb.pth'), - 'hrnet_w40': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnetv2_w40-7cd397a4.pth'), - 'hrnet_w44': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnetv2_w44-c9ac8c18.pth'), - 'hrnet_w48': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnetv2_w48-abd2e6ab.pth'), - 'hrnet_w64': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnetv2_w64-b47cc881.pth'), -} - -cfg_cls = dict( - hrnet_w18_small=dict( - STEM_WIDTH=64, - STAGE1=dict( - NUM_MODULES=1, - NUM_BRANCHES=1, - BLOCK='BOTTLENECK', - NUM_BLOCKS=(1,), - NUM_CHANNELS=(32,), - FUSE_METHOD='SUM', - ), - STAGE2=dict( - NUM_MODULES=1, - NUM_BRANCHES=2, - BLOCK='BASIC', - NUM_BLOCKS=(2, 2), - NUM_CHANNELS=(16, 32), - FUSE_METHOD='SUM' - ), - STAGE3=dict( - NUM_MODULES=1, - NUM_BRANCHES=3, - BLOCK='BASIC', - NUM_BLOCKS=(2, 2, 2), - NUM_CHANNELS=(16, 32, 64), - FUSE_METHOD='SUM' - ), - STAGE4=dict( - NUM_MODULES=1, - NUM_BRANCHES=4, - BLOCK='BASIC', - NUM_BLOCKS=(2, 2, 2, 2), - NUM_CHANNELS=(16, 32, 64, 128), - FUSE_METHOD='SUM', - ), - ), - - hrnet_w18_small_v2=dict( - STEM_WIDTH=64, - STAGE1=dict( - NUM_MODULES=1, - NUM_BRANCHES=1, - BLOCK='BOTTLENECK', - NUM_BLOCKS=(2,), - NUM_CHANNELS=(64,), - FUSE_METHOD='SUM', - ), - STAGE2=dict( - NUM_MODULES=1, - NUM_BRANCHES=2, - BLOCK='BASIC', - NUM_BLOCKS=(2, 2), - NUM_CHANNELS=(18, 36), - FUSE_METHOD='SUM' - ), - STAGE3=dict( - NUM_MODULES=3, - NUM_BRANCHES=3, - BLOCK='BASIC', - NUM_BLOCKS=(2, 2, 2), - NUM_CHANNELS=(18, 36, 72), - 
FUSE_METHOD='SUM' - ), - STAGE4=dict( - NUM_MODULES=2, - NUM_BRANCHES=4, - BLOCK='BASIC', - NUM_BLOCKS=(2, 2, 2, 2), - NUM_CHANNELS=(18, 36, 72, 144), - FUSE_METHOD='SUM', - ), - ), - - hrnet_w18=dict( - STEM_WIDTH=64, - STAGE1=dict( - NUM_MODULES=1, - NUM_BRANCHES=1, - BLOCK='BOTTLENECK', - NUM_BLOCKS=(4,), - NUM_CHANNELS=(64,), - FUSE_METHOD='SUM', - ), - STAGE2=dict( - NUM_MODULES=1, - NUM_BRANCHES=2, - BLOCK='BASIC', - NUM_BLOCKS=(4, 4), - NUM_CHANNELS=(18, 36), - FUSE_METHOD='SUM' - ), - STAGE3=dict( - NUM_MODULES=4, - NUM_BRANCHES=3, - BLOCK='BASIC', - NUM_BLOCKS=(4, 4, 4), - NUM_CHANNELS=(18, 36, 72), - FUSE_METHOD='SUM' - ), - STAGE4=dict( - NUM_MODULES=3, - NUM_BRANCHES=4, - BLOCK='BASIC', - NUM_BLOCKS=(4, 4, 4, 4), - NUM_CHANNELS=(18, 36, 72, 144), - FUSE_METHOD='SUM', - ), - ), - - hrnet_w30=dict( - STEM_WIDTH=64, - STAGE1=dict( - NUM_MODULES=1, - NUM_BRANCHES=1, - BLOCK='BOTTLENECK', - NUM_BLOCKS=(4,), - NUM_CHANNELS=(64,), - FUSE_METHOD='SUM', - ), - STAGE2=dict( - NUM_MODULES=1, - NUM_BRANCHES=2, - BLOCK='BASIC', - NUM_BLOCKS=(4, 4), - NUM_CHANNELS=(30, 60), - FUSE_METHOD='SUM' - ), - STAGE3=dict( - NUM_MODULES=4, - NUM_BRANCHES=3, - BLOCK='BASIC', - NUM_BLOCKS=(4, 4, 4), - NUM_CHANNELS=(30, 60, 120), - FUSE_METHOD='SUM' - ), - STAGE4=dict( - NUM_MODULES=3, - NUM_BRANCHES=4, - BLOCK='BASIC', - NUM_BLOCKS=(4, 4, 4, 4), - NUM_CHANNELS=(30, 60, 120, 240), - FUSE_METHOD='SUM', - ), - ), - - hrnet_w32=dict( - STEM_WIDTH=64, - STAGE1=dict( - NUM_MODULES=1, - NUM_BRANCHES=1, - BLOCK='BOTTLENECK', - NUM_BLOCKS=(4,), - NUM_CHANNELS=(64,), - FUSE_METHOD='SUM', - ), - STAGE2=dict( - NUM_MODULES=1, - NUM_BRANCHES=2, - BLOCK='BASIC', - NUM_BLOCKS=(4, 4), - NUM_CHANNELS=(32, 64), - FUSE_METHOD='SUM' - ), - STAGE3=dict( - NUM_MODULES=4, - NUM_BRANCHES=3, - BLOCK='BASIC', - NUM_BLOCKS=(4, 4, 4), - NUM_CHANNELS=(32, 64, 128), - FUSE_METHOD='SUM' - ), - STAGE4=dict( - NUM_MODULES=3, - NUM_BRANCHES=4, - BLOCK='BASIC', - NUM_BLOCKS=(4, 4, 4, 4), - NUM_CHANNELS=(32, 64, 128, 256), - FUSE_METHOD='SUM', - ), - ), - - hrnet_w40=dict( - STEM_WIDTH=64, - STAGE1=dict( - NUM_MODULES=1, - NUM_BRANCHES=1, - BLOCK='BOTTLENECK', - NUM_BLOCKS=(4,), - NUM_CHANNELS=(64,), - FUSE_METHOD='SUM', - ), - STAGE2=dict( - NUM_MODULES=1, - NUM_BRANCHES=2, - BLOCK='BASIC', - NUM_BLOCKS=(4, 4), - NUM_CHANNELS=(40, 80), - FUSE_METHOD='SUM' - ), - STAGE3=dict( - NUM_MODULES=4, - NUM_BRANCHES=3, - BLOCK='BASIC', - NUM_BLOCKS=(4, 4, 4), - NUM_CHANNELS=(40, 80, 160), - FUSE_METHOD='SUM' - ), - STAGE4=dict( - NUM_MODULES=3, - NUM_BRANCHES=4, - BLOCK='BASIC', - NUM_BLOCKS=(4, 4, 4, 4), - NUM_CHANNELS=(40, 80, 160, 320), - FUSE_METHOD='SUM', - ), - ), - - hrnet_w44=dict( - STEM_WIDTH=64, - STAGE1=dict( - NUM_MODULES=1, - NUM_BRANCHES=1, - BLOCK='BOTTLENECK', - NUM_BLOCKS=(4,), - NUM_CHANNELS=(64,), - FUSE_METHOD='SUM', - ), - STAGE2=dict( - NUM_MODULES=1, - NUM_BRANCHES=2, - BLOCK='BASIC', - NUM_BLOCKS=(4, 4), - NUM_CHANNELS=(44, 88), - FUSE_METHOD='SUM' - ), - STAGE3=dict( - NUM_MODULES=4, - NUM_BRANCHES=3, - BLOCK='BASIC', - NUM_BLOCKS=(4, 4, 4), - NUM_CHANNELS=(44, 88, 176), - FUSE_METHOD='SUM' - ), - STAGE4=dict( - NUM_MODULES=3, - NUM_BRANCHES=4, - BLOCK='BASIC', - NUM_BLOCKS=(4, 4, 4, 4), - NUM_CHANNELS=(44, 88, 176, 352), - FUSE_METHOD='SUM', - ), - ), - - hrnet_w48=dict( - STEM_WIDTH=64, - STAGE1=dict( - NUM_MODULES=1, - NUM_BRANCHES=1, - BLOCK='BOTTLENECK', - NUM_BLOCKS=(4,), - NUM_CHANNELS=(64,), - FUSE_METHOD='SUM', - ), - STAGE2=dict( - NUM_MODULES=1, - NUM_BRANCHES=2, - BLOCK='BASIC', - NUM_BLOCKS=(4, 4), - 
NUM_CHANNELS=(48, 96), - FUSE_METHOD='SUM' - ), - STAGE3=dict( - NUM_MODULES=4, - NUM_BRANCHES=3, - BLOCK='BASIC', - NUM_BLOCKS=(4, 4, 4), - NUM_CHANNELS=(48, 96, 192), - FUSE_METHOD='SUM' - ), - STAGE4=dict( - NUM_MODULES=3, - NUM_BRANCHES=4, - BLOCK='BASIC', - NUM_BLOCKS=(4, 4, 4, 4), - NUM_CHANNELS=(48, 96, 192, 384), - FUSE_METHOD='SUM', - ), - ), - - hrnet_w64=dict( - STEM_WIDTH=64, - STAGE1=dict( - NUM_MODULES=1, - NUM_BRANCHES=1, - BLOCK='BOTTLENECK', - NUM_BLOCKS=(4,), - NUM_CHANNELS=(64,), - FUSE_METHOD='SUM', - ), - STAGE2=dict( - NUM_MODULES=1, - NUM_BRANCHES=2, - BLOCK='BASIC', - NUM_BLOCKS=(4, 4), - NUM_CHANNELS=(64, 128), - FUSE_METHOD='SUM' - ), - STAGE3=dict( - NUM_MODULES=4, - NUM_BRANCHES=3, - BLOCK='BASIC', - NUM_BLOCKS=(4, 4, 4), - NUM_CHANNELS=(64, 128, 256), - FUSE_METHOD='SUM' - ), - STAGE4=dict( - NUM_MODULES=3, - NUM_BRANCHES=4, - BLOCK='BASIC', - NUM_BLOCKS=(4, 4, 4, 4), - NUM_CHANNELS=(64, 128, 256, 512), - FUSE_METHOD='SUM', - ), - ) -) - - -class HighResolutionModule(nn.Module): - def __init__(self, num_branches, blocks, num_blocks, num_inchannels, - num_channels, fuse_method, multi_scale_output=True): - super(HighResolutionModule, self).__init__() - self._check_branches( - num_branches, blocks, num_blocks, num_inchannels, num_channels) - - self.num_inchannels = num_inchannels - self.fuse_method = fuse_method - self.num_branches = num_branches - - self.multi_scale_output = multi_scale_output - - self.branches = self._make_branches( - num_branches, blocks, num_blocks, num_channels) - self.fuse_layers = self._make_fuse_layers() - self.fuse_act = nn.ReLU(False) - - def _check_branches(self, num_branches, blocks, num_blocks, num_inchannels, num_channels): - error_msg = '' - if num_branches != len(num_blocks): - error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format(num_branches, len(num_blocks)) - elif num_branches != len(num_channels): - error_msg = 'NUM_BRANCHES({}) <> NUM_CHANNELS({})'.format(num_branches, len(num_channels)) - elif num_branches != len(num_inchannels): - error_msg = 'NUM_BRANCHES({}) <> NUM_INCHANNELS({})'.format(num_branches, len(num_inchannels)) - if error_msg: - _logger.error(error_msg) - raise ValueError(error_msg) - - def _make_one_branch(self, branch_index, block, num_blocks, num_channels, stride=1): - downsample = None - if stride != 1 or self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion: - downsample = nn.Sequential( - nn.Conv2d( - self.num_inchannels[branch_index], num_channels[branch_index] * block.expansion, - kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(num_channels[branch_index] * block.expansion, momentum=_BN_MOMENTUM), - ) - - layers = [block(self.num_inchannels[branch_index], num_channels[branch_index], stride, downsample)] - self.num_inchannels[branch_index] = num_channels[branch_index] * block.expansion - for i in range(1, num_blocks[branch_index]): - layers.append(block(self.num_inchannels[branch_index], num_channels[branch_index])) - - return nn.Sequential(*layers) - - def _make_branches(self, num_branches, block, num_blocks, num_channels): - branches = [] - for i in range(num_branches): - branches.append(self._make_one_branch(i, block, num_blocks, num_channels)) - - return nn.ModuleList(branches) - - def _make_fuse_layers(self): - if self.num_branches == 1: - return nn.Identity() - - num_branches = self.num_branches - num_inchannels = self.num_inchannels - fuse_layers = [] - for i in range(num_branches if self.multi_scale_output else 1): - fuse_layer = [] - for j in 
range(num_branches): - if j > i: - fuse_layer.append(nn.Sequential( - nn.Conv2d(num_inchannels[j], num_inchannels[i], 1, 1, 0, bias=False), - nn.BatchNorm2d(num_inchannels[i], momentum=_BN_MOMENTUM), - nn.Upsample(scale_factor=2 ** (j - i), mode='nearest'))) - elif j == i: - fuse_layer.append(nn.Identity()) - else: - conv3x3s = [] - for k in range(i - j): - if k == i - j - 1: - num_outchannels_conv3x3 = num_inchannels[i] - conv3x3s.append(nn.Sequential( - nn.Conv2d(num_inchannels[j], num_outchannels_conv3x3, 3, 2, 1, bias=False), - nn.BatchNorm2d(num_outchannels_conv3x3, momentum=_BN_MOMENTUM))) - else: - num_outchannels_conv3x3 = num_inchannels[j] - conv3x3s.append(nn.Sequential( - nn.Conv2d(num_inchannels[j], num_outchannels_conv3x3, 3, 2, 1, bias=False), - nn.BatchNorm2d(num_outchannels_conv3x3, momentum=_BN_MOMENTUM), - nn.ReLU(False))) - fuse_layer.append(nn.Sequential(*conv3x3s)) - fuse_layers.append(nn.ModuleList(fuse_layer)) - - return nn.ModuleList(fuse_layers) - - def get_num_inchannels(self): - return self.num_inchannels - - def forward(self, x: List[torch.Tensor]): - if self.num_branches == 1: - return [self.branches[0](x[0])] - - for i, branch in enumerate(self.branches): - x[i] = branch(x[i]) - - x_fuse = [] - for i, fuse_outer in enumerate(self.fuse_layers): - y = x[0] if i == 0 else fuse_outer[0](x[0]) - for j in range(1, self.num_branches): - if i == j: - y = y + x[j] - else: - y = y + fuse_outer[j](x[j]) - x_fuse.append(self.fuse_act(y)) - - return x_fuse - - -blocks_dict = { - 'BASIC': BasicBlock, - 'BOTTLENECK': Bottleneck -} - - -class HighResolutionNet(nn.Module): - - def __init__(self, cfg, in_chans=3, num_classes=1000, global_pool='avg', drop_rate=0.0, head='classification'): - super(HighResolutionNet, self).__init__() - self.num_classes = num_classes - self.drop_rate = drop_rate - - stem_width = cfg['STEM_WIDTH'] - self.conv1 = nn.Conv2d(in_chans, stem_width, kernel_size=3, stride=2, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(stem_width, momentum=_BN_MOMENTUM) - self.act1 = nn.ReLU(inplace=True) - self.conv2 = nn.Conv2d(stem_width, 64, kernel_size=3, stride=2, padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(64, momentum=_BN_MOMENTUM) - self.act2 = nn.ReLU(inplace=True) - - self.stage1_cfg = cfg['STAGE1'] - num_channels = self.stage1_cfg['NUM_CHANNELS'][0] - block = blocks_dict[self.stage1_cfg['BLOCK']] - num_blocks = self.stage1_cfg['NUM_BLOCKS'][0] - self.layer1 = self._make_layer(block, 64, num_channels, num_blocks) - stage1_out_channel = block.expansion * num_channels - - self.stage2_cfg = cfg['STAGE2'] - num_channels = self.stage2_cfg['NUM_CHANNELS'] - block = blocks_dict[self.stage2_cfg['BLOCK']] - num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))] - self.transition1 = self._make_transition_layer([stage1_out_channel], num_channels) - self.stage2, pre_stage_channels = self._make_stage(self.stage2_cfg, num_channels) - - self.stage3_cfg = cfg['STAGE3'] - num_channels = self.stage3_cfg['NUM_CHANNELS'] - block = blocks_dict[self.stage3_cfg['BLOCK']] - num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))] - self.transition2 = self._make_transition_layer(pre_stage_channels, num_channels) - self.stage3, pre_stage_channels = self._make_stage(self.stage3_cfg, num_channels) - - self.stage4_cfg = cfg['STAGE4'] - num_channels = self.stage4_cfg['NUM_CHANNELS'] - block = blocks_dict[self.stage4_cfg['BLOCK']] - num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))] - 
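
As a reading aid, a comment-only sketch of the fuse wiring that _make_fuse_layers builds above, for a three-branch module (derived from the code in this diff):

# fuse_layers[i][j] brings branch j onto branch i's resolution and width:
#   j > i : 1x1 conv to num_inchannels[i] -> BN -> nearest upsample by 2**(j - i)
#   j == i: nn.Identity()
#   j < i : (i - j) stacked stride-2 3x3 convs; the last widens to
#           num_inchannels[i], earlier ones keep num_inchannels[j] and add ReLU
# forward() then sums the aligned maps per output branch and applies fuse_act.
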
self.transition3 = self._make_transition_layer(pre_stage_channels, num_channels) - self.stage4, pre_stage_channels = self._make_stage(self.stage4_cfg, num_channels, multi_scale_output=True) - - self.head = head - self.head_channels = None # set if _make_head called - if head == 'classification': - # Classification Head - self.num_features = 2048 - self.incre_modules, self.downsamp_modules, self.final_layer = self._make_head(pre_stage_channels) - self.global_pool, self.classifier = create_classifier( - self.num_features, self.num_classes, pool_type=global_pool) - elif head == 'incre': - self.num_features = 2048 - self.incre_modules, _, _ = self._make_head(pre_stage_channels, True) - else: - self.incre_modules = None - self.num_features = 256 - - curr_stride = 2 - # module names aren't actually valid here, hook or FeatureNet based extraction would not work - self.feature_info = [dict(num_chs=64, reduction=curr_stride, module='stem')] - for i, c in enumerate(self.head_channels if self.head_channels else num_channels): - curr_stride *= 2 - c = c * 4 if self.head_channels else c # head block expansion factor of 4 - self.feature_info += [dict(num_chs=c, reduction=curr_stride, module=f'stage{i + 1}')] - - self.init_weights() - - def _make_head(self, pre_stage_channels, incre_only=False): - head_block = Bottleneck - self.head_channels = [32, 64, 128, 256] - - # Increasing the #channels on each resolution - # from C, 2C, 4C, 8C to 128, 256, 512, 1024 - incre_modules = [] - for i, channels in enumerate(pre_stage_channels): - incre_modules.append(self._make_layer(head_block, channels, self.head_channels[i], 1, stride=1)) - incre_modules = nn.ModuleList(incre_modules) - if incre_only: - return incre_modules, None, None - - # downsampling modules - downsamp_modules = [] - for i in range(len(pre_stage_channels) - 1): - in_channels = self.head_channels[i] * head_block.expansion - out_channels = self.head_channels[i + 1] * head_block.expansion - downsamp_module = nn.Sequential( - nn.Conv2d( - in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=2, padding=1), - nn.BatchNorm2d(out_channels, momentum=_BN_MOMENTUM), - nn.ReLU(inplace=True) - ) - downsamp_modules.append(downsamp_module) - downsamp_modules = nn.ModuleList(downsamp_modules) - - final_layer = nn.Sequential( - nn.Conv2d( - in_channels=self.head_channels[3] * head_block.expansion, - out_channels=self.num_features, kernel_size=1, stride=1, padding=0 - ), - nn.BatchNorm2d(self.num_features, momentum=_BN_MOMENTUM), - nn.ReLU(inplace=True) - ) - - return incre_modules, downsamp_modules, final_layer - - def _make_transition_layer(self, num_channels_pre_layer, num_channels_cur_layer): - num_branches_cur = len(num_channels_cur_layer) - num_branches_pre = len(num_channels_pre_layer) - - transition_layers = [] - for i in range(num_branches_cur): - if i < num_branches_pre: - if num_channels_cur_layer[i] != num_channels_pre_layer[i]: - transition_layers.append(nn.Sequential( - nn.Conv2d(num_channels_pre_layer[i], num_channels_cur_layer[i], 3, 1, 1, bias=False), - nn.BatchNorm2d(num_channels_cur_layer[i], momentum=_BN_MOMENTUM), - nn.ReLU(inplace=True))) - else: - transition_layers.append(nn.Identity()) - else: - conv3x3s = [] - for j in range(i + 1 - num_branches_pre): - inchannels = num_channels_pre_layer[-1] - outchannels = num_channels_cur_layer[i] if j == i - num_branches_pre else inchannels - conv3x3s.append(nn.Sequential( - nn.Conv2d(inchannels, outchannels, 3, 2, 1, bias=False), - nn.BatchNorm2d(outchannels, 
momentum=_BN_MOMENTUM), - nn.ReLU(inplace=True))) - transition_layers.append(nn.Sequential(*conv3x3s)) - - return nn.ModuleList(transition_layers) - - def _make_layer(self, block, inplanes, planes, blocks, stride=1): - downsample = None - if stride != 1 or inplanes != planes * block.expansion: - downsample = nn.Sequential( - nn.Conv2d(inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(planes * block.expansion, momentum=_BN_MOMENTUM), - ) - - layers = [block(inplanes, planes, stride, downsample)] - inplanes = planes * block.expansion - for i in range(1, blocks): - layers.append(block(inplanes, planes)) - - return nn.Sequential(*layers) - - def _make_stage(self, layer_config, num_inchannels, multi_scale_output=True): - num_modules = layer_config['NUM_MODULES'] - num_branches = layer_config['NUM_BRANCHES'] - num_blocks = layer_config['NUM_BLOCKS'] - num_channels = layer_config['NUM_CHANNELS'] - block = blocks_dict[layer_config['BLOCK']] - fuse_method = layer_config['FUSE_METHOD'] - - modules = [] - for i in range(num_modules): - # multi_scale_output is only used by the last module - reset_multi_scale_output = multi_scale_output or i < num_modules - 1 - modules.append(HighResolutionModule( - num_branches, block, num_blocks, num_inchannels, num_channels, fuse_method, reset_multi_scale_output) - ) - num_inchannels = modules[-1].get_num_inchannels() - - return nn.Sequential(*modules), num_inchannels - - def init_weights(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_( - m.weight, mode='fan_out', nonlinearity='relu') - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - def get_classifier(self): - return self.classifier - - def reset_classifier(self, num_classes, global_pool='avg'): - self.num_classes = num_classes - self.global_pool, self.classifier = create_classifier( - self.num_features, self.num_classes, pool_type=global_pool) - - def stages(self, x) -> List[torch.Tensor]: - x = self.layer1(x) - - xl = [t(x) for i, t in enumerate(self.transition1)] - yl = self.stage2(xl) - - xl = [t(yl[-1]) if not isinstance(t, nn.Identity) else yl[i] for i, t in enumerate(self.transition2)] - yl = self.stage3(xl) - - xl = [t(yl[-1]) if not isinstance(t, nn.Identity) else yl[i] for i, t in enumerate(self.transition3)] - yl = self.stage4(xl) - return yl - - def forward_features(self, x): - # Stem - x = self.conv1(x) - x = self.bn1(x) - x = self.act1(x) - x = self.conv2(x) - x = self.bn2(x) - x = self.act2(x) - - # Stages - yl = self.stages(x) - - # Classification Head - y = self.incre_modules[0](yl[0]) - for i, down in enumerate(self.downsamp_modules): - y = self.incre_modules[i + 1](yl[i + 1]) + down(y) - y = self.final_layer(y) - return y - - def forward(self, x): - x = self.forward_features(x) - x = self.global_pool(x) - if self.drop_rate > 0.: - x = F.dropout(x, p=self.drop_rate, training=self.training) - x = self.classifier(x) - return x - - -class HighResolutionNetFeatures(HighResolutionNet): - """HighResolutionNet feature extraction - - The design of HRNet makes it easy to grab feature maps; this class provides a simple wrapper to do so. - It would be more complicated to use the FeatureNet helpers. - - The `feature_location=incre` allows grabbing increased channel count features using part of the - classification head. If `feature_location=''` the default HRNet features are returned. First stem - conv is used for stride 2 features.
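
A usage sketch for this wrapper via the registry entrypoints further below (assumes torch is imported; channel counts follow the feature_info entries built in HighResolutionNet.__init__):

m = hrnet_w18(pretrained=False, features_only=True, out_indices=(0, 1, 2, 3, 4))
feats = m(torch.randn(1, 3, 224, 224))
# feats is a list of 5 maps: 64 ch at stride 2 (stem), then the 'incre' features
# at 128/256/512/1024 ch (head_channels * expansion 4) and strides 4/8/16/32.
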
- """ - - def __init__(self, cfg, in_chans=3, num_classes=1000, global_pool='avg', drop_rate=0.0, - feature_location='incre', out_indices=(0, 1, 2, 3, 4)): - assert feature_location in ('incre', '') - super(HighResolutionNetFeatures, self).__init__( - cfg, in_chans=in_chans, num_classes=num_classes, global_pool=global_pool, - drop_rate=drop_rate, head=feature_location) - self.feature_info = FeatureInfo(self.feature_info, out_indices) - self._out_idx = {i for i in out_indices} - - def forward_features(self, x): - assert False, 'Not supported' - - def forward(self, x) -> List[torch.tensor]: - out = [] - x = self.conv1(x) - x = self.bn1(x) - x = self.act1(x) - if 0 in self._out_idx: - out.append(x) - x = self.conv2(x) - x = self.bn2(x) - x = self.act2(x) - x = self.stages(x) - if self.incre_modules is not None: - x = [incre(f) for f, incre in zip(x, self.incre_modules)] - for i, f in enumerate(x): - if i + 1 in self._out_idx: - out.append(f) - return out - - -def _create_hrnet(variant, pretrained, **model_kwargs): - model_cls = HighResolutionNet - features_only = False - kwargs_filter = None - if model_kwargs.pop('features_only', False): - model_cls = HighResolutionNetFeatures - kwargs_filter = ('num_classes', 'global_pool') - features_only = True - model = build_model_with_cfg( - model_cls, variant, pretrained, - default_cfg=default_cfgs[variant], - model_cfg=cfg_cls[variant], - pretrained_strict=not features_only, - kwargs_filter=kwargs_filter, - **model_kwargs) - if features_only: - model.default_cfg = default_cfg_for_features(model.default_cfg) - return model - - -@register_model -def hrnet_w18_small(pretrained=True, **kwargs): - return _create_hrnet('hrnet_w18_small', pretrained, **kwargs) - - -@register_model -def hrnet_w18_small_v2(pretrained=True, **kwargs): - return _create_hrnet('hrnet_w18_small_v2', pretrained, **kwargs) - - -@register_model -def hrnet_w18(pretrained=True, **kwargs): - return _create_hrnet('hrnet_w18', pretrained, **kwargs) - - -@register_model -def hrnet_w30(pretrained=True, **kwargs): - return _create_hrnet('hrnet_w30', pretrained, **kwargs) - - -@register_model -def hrnet_w32(pretrained=True, **kwargs): - return _create_hrnet('hrnet_w32', pretrained, **kwargs) - - -@register_model -def hrnet_w40(pretrained=True, **kwargs): - return _create_hrnet('hrnet_w40', pretrained, **kwargs) - - -@register_model -def hrnet_w44(pretrained=True, **kwargs): - return _create_hrnet('hrnet_w44', pretrained, **kwargs) - - -@register_model -def hrnet_w48(pretrained=True, **kwargs): - return _create_hrnet('hrnet_w48', pretrained, **kwargs) - - -@register_model -def hrnet_w64(pretrained=True, **kwargs): - return _create_hrnet('hrnet_w64', pretrained, **kwargs) diff --git a/AVLFormer/src/timm/models/hub.py b/AVLFormer/src/timm/models/hub.py deleted file mode 100644 index 7842d37..0000000 --- a/AVLFormer/src/timm/models/hub.py +++ /dev/null @@ -1,96 +0,0 @@ -import json -import logging -import os -from functools import partial -from typing import Union, Optional - -import torch -from torch.hub import load_state_dict_from_url, download_url_to_file, urlparse, HASH_REGEX -try: - from torch.hub import get_dir -except ImportError: - from torch.hub import _get_torch_home as get_dir - -from src.timm import __version__ -try: - from huggingface_hub import hf_hub_url - from huggingface_hub import cached_download - cached_download = partial(cached_download, library_name="timm", library_version=__version__) -except ImportError: - hf_hub_url = None - cached_download = None - -_logger = 
logging.getLogger(__name__) - - -def get_cache_dir(child_dir=''): - """ - Returns the location of the directory where models are cached (and creates it if necessary). - """ - # Issue warning to move data if old env is set - if os.getenv('TORCH_MODEL_ZOO'): - _logger.warning('TORCH_MODEL_ZOO is deprecated, please use env TORCH_HOME instead') - - hub_dir = get_dir() - child_dir = () if not child_dir else (child_dir,) - model_dir = os.path.join(hub_dir, 'checkpoints', *child_dir) - os.makedirs(model_dir, exist_ok=True) - return model_dir - - -def download_cached_file(url, check_hash=True, progress=False): - parts = urlparse(url) - filename = os.path.basename(parts.path) - cached_file = os.path.join(get_cache_dir(), filename) - if not os.path.exists(cached_file): - _logger.info('Downloading: "{}" to {}\n'.format(url, cached_file)) - hash_prefix = None - if check_hash: - r = HASH_REGEX.search(filename) # r is Optional[Match[str]] - hash_prefix = r.group(1) if r else None - download_url_to_file(url, cached_file, hash_prefix, progress=progress) - return cached_file - - -def has_hf_hub(necessary=False): - if hf_hub_url is None and necessary: - # if no HF Hub module installed and it is necessary to continue, raise error - raise RuntimeError( - 'Hugging Face hub model specified but package not installed. Run `pip install huggingface_hub`.') - return hf_hub_url is not None - - -def hf_split(hf_id): - rev_split = hf_id.split('@') - assert 0 < len(rev_split) <= 2, 'hf_hub id should only contain one @ character to identify revision.' - hf_model_id = rev_split[0] - hf_revision = rev_split[-1] if len(rev_split) > 1 else None - return hf_model_id, hf_revision - - -def load_cfg_from_json(json_file: Union[str, os.PathLike]): - with open(json_file, "r", encoding="utf-8") as reader: - text = reader.read() - return json.loads(text) - - -def _download_from_hf(model_id: str, filename: str): - hf_model_id, hf_revision = hf_split(model_id) - url = hf_hub_url(hf_model_id, filename, revision=hf_revision) - return cached_download(url, cache_dir=get_cache_dir('hf')) - - -def load_model_config_from_hf(model_id: str): - assert has_hf_hub(True) - cached_file = _download_from_hf(model_id, 'config.json') - default_cfg = load_cfg_from_json(cached_file) - default_cfg['hf_hub'] = model_id # insert hf_hub id for pretrained weight load during model creation - model_name = default_cfg.get('architecture') - return default_cfg, model_name - - -def load_state_dict_from_hf(model_id: str): - assert has_hf_hub(True) - cached_file = _download_from_hf(model_id, 'pytorch_model.bin') - state_dict = torch.load(cached_file, map_location='cpu') - return state_dict diff --git a/AVLFormer/src/timm/models/inception_resnet_v2.py b/AVLFormer/src/timm/models/inception_resnet_v2.py deleted file mode 100644 index 431f716..0000000 --- a/AVLFormer/src/timm/models/inception_resnet_v2.py +++ /dev/null @@ -1,358 +0,0 @@ -""" Pytorch Inception-Resnet-V2 implementation -Sourced from https://github.com/Cadene/tensorflow-model-zoo.torch (MIT License) which is -based upon Google's Tensorflow implementation and pretrained weights (Apache 2.0 License) -""" -import torch -import torch.nn as nn -import torch.nn.functional as F - -from src.timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD -from .helpers import build_model_with_cfg -from .layers import create_classifier -from .registry import register_model - -__all__ = ['InceptionResnetV2'] - -default_cfgs = { - # ported from http://download.tensorflow.org/models/inception_resnet_v2_2016_08_30.tar.gz 
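
Stepping back to the hub.py helpers above: they accept ids of the form org/name@revision, split by hf_split. A hedged usage sketch with a hypothetical model id:

# Requires `pip install huggingface_hub` (see has_hf_hub); the id is illustrative.
default_cfg, model_name = load_model_config_from_hf('someorg/some-model@main')
state_dict = load_state_dict_from_hf('someorg/some-model@main')
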
- 'inception_resnet_v2': { - 'url': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/inception_resnet_v2-940b1cd6.pth', - 'num_classes': 1000, 'input_size': (3, 299, 299), 'pool_size': (8, 8), - 'crop_pct': 0.8975, 'interpolation': 'bicubic', - 'mean': IMAGENET_INCEPTION_MEAN, 'std': IMAGENET_INCEPTION_STD, - 'first_conv': 'conv2d_1a.conv', 'classifier': 'classif', - 'label_offset': 1, # 1001 classes in pretrained weights - }, - # ported from http://download.tensorflow.org/models/ens_adv_inception_resnet_v2_2017_08_18.tar.gz - 'ens_adv_inception_resnet_v2': { - 'url': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ens_adv_inception_resnet_v2-2592a550.pth', - 'num_classes': 1000, 'input_size': (3, 299, 299), 'pool_size': (8, 8), - 'crop_pct': 0.8975, 'interpolation': 'bicubic', - 'mean': IMAGENET_INCEPTION_MEAN, 'std': IMAGENET_INCEPTION_STD, - 'first_conv': 'conv2d_1a.conv', 'classifier': 'classif', - 'label_offset': 1, # 1001 classes in pretrained weights - } -} - - -class BasicConv2d(nn.Module): - def __init__(self, in_planes, out_planes, kernel_size, stride, padding=0): - super(BasicConv2d, self).__init__() - self.conv = nn.Conv2d( - in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, bias=False) - self.bn = nn.BatchNorm2d(out_planes, eps=.001) - self.relu = nn.ReLU(inplace=False) - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.relu(x) - return x - - -class Mixed_5b(nn.Module): - def __init__(self): - super(Mixed_5b, self).__init__() - - self.branch0 = BasicConv2d(192, 96, kernel_size=1, stride=1) - - self.branch1 = nn.Sequential( - BasicConv2d(192, 48, kernel_size=1, stride=1), - BasicConv2d(48, 64, kernel_size=5, stride=1, padding=2) - ) - - self.branch2 = nn.Sequential( - BasicConv2d(192, 64, kernel_size=1, stride=1), - BasicConv2d(64, 96, kernel_size=3, stride=1, padding=1), - BasicConv2d(96, 96, kernel_size=3, stride=1, padding=1) - ) - - self.branch3 = nn.Sequential( - nn.AvgPool2d(3, stride=1, padding=1, count_include_pad=False), - BasicConv2d(192, 64, kernel_size=1, stride=1) - ) - - def forward(self, x): - x0 = self.branch0(x) - x1 = self.branch1(x) - x2 = self.branch2(x) - x3 = self.branch3(x) - out = torch.cat((x0, x1, x2, x3), 1) - return out - - -class Block35(nn.Module): - def __init__(self, scale=1.0): - super(Block35, self).__init__() - - self.scale = scale - - self.branch0 = BasicConv2d(320, 32, kernel_size=1, stride=1) - - self.branch1 = nn.Sequential( - BasicConv2d(320, 32, kernel_size=1, stride=1), - BasicConv2d(32, 32, kernel_size=3, stride=1, padding=1) - ) - - self.branch2 = nn.Sequential( - BasicConv2d(320, 32, kernel_size=1, stride=1), - BasicConv2d(32, 48, kernel_size=3, stride=1, padding=1), - BasicConv2d(48, 64, kernel_size=3, stride=1, padding=1) - ) - - self.conv2d = nn.Conv2d(128, 320, kernel_size=1, stride=1) - self.relu = nn.ReLU(inplace=False) - - def forward(self, x): - x0 = self.branch0(x) - x1 = self.branch1(x) - x2 = self.branch2(x) - out = torch.cat((x0, x1, x2), 1) - out = self.conv2d(out) - out = out * self.scale + x - out = self.relu(out) - return out - - -class Mixed_6a(nn.Module): - def __init__(self): - super(Mixed_6a, self).__init__() - - self.branch0 = BasicConv2d(320, 384, kernel_size=3, stride=2) - - self.branch1 = nn.Sequential( - BasicConv2d(320, 256, kernel_size=1, stride=1), - BasicConv2d(256, 256, kernel_size=3, stride=1, padding=1), - BasicConv2d(256, 384, kernel_size=3, stride=2) - ) - - self.branch2 = 
nn.MaxPool2d(3, stride=2) - - def forward(self, x): - x0 = self.branch0(x) - x1 = self.branch1(x) - x2 = self.branch2(x) - out = torch.cat((x0, x1, x2), 1) - return out - - -class Block17(nn.Module): - def __init__(self, scale=1.0): - super(Block17, self).__init__() - - self.scale = scale - - self.branch0 = BasicConv2d(1088, 192, kernel_size=1, stride=1) - - self.branch1 = nn.Sequential( - BasicConv2d(1088, 128, kernel_size=1, stride=1), - BasicConv2d(128, 160, kernel_size=(1, 7), stride=1, padding=(0, 3)), - BasicConv2d(160, 192, kernel_size=(7, 1), stride=1, padding=(3, 0)) - ) - - self.conv2d = nn.Conv2d(384, 1088, kernel_size=1, stride=1) - self.relu = nn.ReLU(inplace=False) - - def forward(self, x): - x0 = self.branch0(x) - x1 = self.branch1(x) - out = torch.cat((x0, x1), 1) - out = self.conv2d(out) - out = out * self.scale + x - out = self.relu(out) - return out - - -class Mixed_7a(nn.Module): - def __init__(self): - super(Mixed_7a, self).__init__() - - self.branch0 = nn.Sequential( - BasicConv2d(1088, 256, kernel_size=1, stride=1), - BasicConv2d(256, 384, kernel_size=3, stride=2) - ) - - self.branch1 = nn.Sequential( - BasicConv2d(1088, 256, kernel_size=1, stride=1), - BasicConv2d(256, 288, kernel_size=3, stride=2) - ) - - self.branch2 = nn.Sequential( - BasicConv2d(1088, 256, kernel_size=1, stride=1), - BasicConv2d(256, 288, kernel_size=3, stride=1, padding=1), - BasicConv2d(288, 320, kernel_size=3, stride=2) - ) - - self.branch3 = nn.MaxPool2d(3, stride=2) - - def forward(self, x): - x0 = self.branch0(x) - x1 = self.branch1(x) - x2 = self.branch2(x) - x3 = self.branch3(x) - out = torch.cat((x0, x1, x2, x3), 1) - return out - - -class Block8(nn.Module): - - def __init__(self, scale=1.0, no_relu=False): - super(Block8, self).__init__() - - self.scale = scale - - self.branch0 = BasicConv2d(2080, 192, kernel_size=1, stride=1) - - self.branch1 = nn.Sequential( - BasicConv2d(2080, 192, kernel_size=1, stride=1), - BasicConv2d(192, 224, kernel_size=(1, 3), stride=1, padding=(0, 1)), - BasicConv2d(224, 256, kernel_size=(3, 1), stride=1, padding=(1, 0)) - ) - - self.conv2d = nn.Conv2d(448, 2080, kernel_size=1, stride=1) - self.relu = None if no_relu else nn.ReLU(inplace=False) - - def forward(self, x): - x0 = self.branch0(x) - x1 = self.branch1(x) - out = torch.cat((x0, x1), 1) - out = self.conv2d(out) - out = out * self.scale + x - if self.relu is not None: - out = self.relu(out) - return out - - -class InceptionResnetV2(nn.Module): - def __init__(self, num_classes=1000, in_chans=3, drop_rate=0., output_stride=32, global_pool='avg'): - super(InceptionResnetV2, self).__init__() - self.drop_rate = drop_rate - self.num_classes = num_classes - self.num_features = 1536 - assert output_stride == 32 - - self.conv2d_1a = BasicConv2d(in_chans, 32, kernel_size=3, stride=2) - self.conv2d_2a = BasicConv2d(32, 32, kernel_size=3, stride=1) - self.conv2d_2b = BasicConv2d(32, 64, kernel_size=3, stride=1, padding=1) - self.feature_info = [dict(num_chs=64, reduction=2, module='conv2d_2b')] - - self.maxpool_3a = nn.MaxPool2d(3, stride=2) - self.conv2d_3b = BasicConv2d(64, 80, kernel_size=1, stride=1) - self.conv2d_4a = BasicConv2d(80, 192, kernel_size=3, stride=1) - self.feature_info += [dict(num_chs=192, reduction=4, module='conv2d_4a')] - - self.maxpool_5a = nn.MaxPool2d(3, stride=2) - self.mixed_5b = Mixed_5b() - self.repeat = nn.Sequential( - Block35(scale=0.17), - Block35(scale=0.17), - Block35(scale=0.17), - Block35(scale=0.17), - Block35(scale=0.17), - Block35(scale=0.17), - Block35(scale=0.17), - 
Block35(scale=0.17), - Block35(scale=0.17), - Block35(scale=0.17) - ) - self.feature_info += [dict(num_chs=320, reduction=8, module='repeat')] - - self.mixed_6a = Mixed_6a() - self.repeat_1 = nn.Sequential( - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10), - Block17(scale=0.10) - ) - self.feature_info += [dict(num_chs=1088, reduction=16, module='repeat_1')] - - self.mixed_7a = Mixed_7a() - self.repeat_2 = nn.Sequential( - Block8(scale=0.20), - Block8(scale=0.20), - Block8(scale=0.20), - Block8(scale=0.20), - Block8(scale=0.20), - Block8(scale=0.20), - Block8(scale=0.20), - Block8(scale=0.20), - Block8(scale=0.20) - ) - self.block8 = Block8(no_relu=True) - self.conv2d_7b = BasicConv2d(2080, self.num_features, kernel_size=1, stride=1) - self.feature_info += [dict(num_chs=self.num_features, reduction=32, module='conv2d_7b')] - - self.global_pool, self.classif = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) - - def get_classifier(self): - return self.classif - - def reset_classifier(self, num_classes, global_pool='avg'): - self.num_classes = num_classes - self.global_pool, self.classif = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) - - def forward_features(self, x): - x = self.conv2d_1a(x) - x = self.conv2d_2a(x) - x = self.conv2d_2b(x) - x = self.maxpool_3a(x) - x = self.conv2d_3b(x) - x = self.conv2d_4a(x) - x = self.maxpool_5a(x) - x = self.mixed_5b(x) - x = self.repeat(x) - x = self.mixed_6a(x) - x = self.repeat_1(x) - x = self.mixed_7a(x) - x = self.repeat_2(x) - x = self.block8(x) - x = self.conv2d_7b(x) - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.global_pool(x) - if self.drop_rate > 0: - x = F.dropout(x, p=self.drop_rate, training=self.training) - x = self.classif(x) - return x - - -def _create_inception_resnet_v2(variant, pretrained=False, **kwargs): - return build_model_with_cfg( - InceptionResnetV2, variant, pretrained, - default_cfg=default_cfgs[variant], - **kwargs) - - -@register_model -def inception_resnet_v2(pretrained=False, **kwargs): - r"""InceptionResnetV2 model architecture from the - `"InceptionV4, Inception-ResNet..." <https://arxiv.org/abs/1602.07261>`_ paper. - """ - return _create_inception_resnet_v2('inception_resnet_v2', pretrained=pretrained, **kwargs) - - -@register_model -def ens_adv_inception_resnet_v2(pretrained=False, **kwargs): - r""" Ensemble Adversarially trained InceptionResnetV2 model architecture - As per https://arxiv.org/abs/1705.07204 and - https://github.com/tensorflow/models/tree/master/research/adv_imagenet_models.
- """ - return _create_inception_resnet_v2('ens_adv_inception_resnet_v2', pretrained=pretrained, **kwargs) diff --git a/AVLFormer/src/timm/models/inception_v3.py b/AVLFormer/src/timm/models/inception_v3.py deleted file mode 100644 index 53b81a0..0000000 --- a/AVLFormer/src/timm/models/inception_v3.py +++ /dev/null @@ -1,470 +0,0 @@ -""" Inception-V3 - -Originally from torchvision Inception3 model -Licensed BSD-Clause 3 https://github.com/pytorch/vision/blob/master/LICENSE -""" -import torch -import torch.nn as nn -import torch.nn.functional as F - -from src.timm.data import IMAGENET_DEFAULT_STD, IMAGENET_DEFAULT_MEAN, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD -from .helpers import build_model_with_cfg -from .registry import register_model -from .layers import trunc_normal_, create_classifier, Linear - - -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 299, 299), 'pool_size': (8, 8), - 'crop_pct': 0.875, 'interpolation': 'bicubic', - 'mean': IMAGENET_INCEPTION_MEAN, 'std': IMAGENET_INCEPTION_STD, - 'first_conv': 'Conv2d_1a_3x3.conv', 'classifier': 'fc', - **kwargs - } - - -default_cfgs = { - # original PyTorch weights, ported from Tensorflow but modified - 'inception_v3': _cfg( - url='https://download.pytorch.org/models/inception_v3_google-1a9a5a14.pth', - has_aux=True), # checkpoint has aux logit layer weights - # my port of Tensorflow SLIM weights (http://download.tensorflow.org/models/inception_v3_2016_08_28.tar.gz) - 'tf_inception_v3': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_inception_v3-e0069de4.pth', - num_classes=1000, has_aux=False, label_offset=1), - # my port of Tensorflow adversarially trained Inception V3 from - # http://download.tensorflow.org/models/adv_inception_v3_2017_08_18.tar.gz - 'adv_inception_v3': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/adv_inception_v3-9e27bd63.pth', - num_classes=1000, has_aux=False, label_offset=1), - # from gluon pretrained models, best performing in terms of accuracy/loss metrics - # https://gluon-cv.mxnet.io/model_zoo/classification.html - 'gluon_inception_v3': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/gluon_inception_v3-9f746940.pth', - mean=IMAGENET_DEFAULT_MEAN, # also works well with inception defaults - std=IMAGENET_DEFAULT_STD, # also works well with inception defaults - has_aux=False, - ) -} - - -class InceptionA(nn.Module): - - def __init__(self, in_channels, pool_features, conv_block=None): - super(InceptionA, self).__init__() - if conv_block is None: - conv_block = BasicConv2d - self.branch1x1 = conv_block(in_channels, 64, kernel_size=1) - - self.branch5x5_1 = conv_block(in_channels, 48, kernel_size=1) - self.branch5x5_2 = conv_block(48, 64, kernel_size=5, padding=2) - - self.branch3x3dbl_1 = conv_block(in_channels, 64, kernel_size=1) - self.branch3x3dbl_2 = conv_block(64, 96, kernel_size=3, padding=1) - self.branch3x3dbl_3 = conv_block(96, 96, kernel_size=3, padding=1) - - self.branch_pool = conv_block(in_channels, pool_features, kernel_size=1) - - def _forward(self, x): - branch1x1 = self.branch1x1(x) - - branch5x5 = self.branch5x5_1(x) - branch5x5 = self.branch5x5_2(branch5x5) - - branch3x3dbl = self.branch3x3dbl_1(x) - branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) - branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) - - branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1) - branch_pool = 
self.branch_pool(branch_pool) - - outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool] - return outputs - - def forward(self, x): - outputs = self._forward(x) - return torch.cat(outputs, 1) - - -class InceptionB(nn.Module): - - def __init__(self, in_channels, conv_block=None): - super(InceptionB, self).__init__() - if conv_block is None: - conv_block = BasicConv2d - self.branch3x3 = conv_block(in_channels, 384, kernel_size=3, stride=2) - - self.branch3x3dbl_1 = conv_block(in_channels, 64, kernel_size=1) - self.branch3x3dbl_2 = conv_block(64, 96, kernel_size=3, padding=1) - self.branch3x3dbl_3 = conv_block(96, 96, kernel_size=3, stride=2) - - def _forward(self, x): - branch3x3 = self.branch3x3(x) - - branch3x3dbl = self.branch3x3dbl_1(x) - branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) - branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) - - branch_pool = F.max_pool2d(x, kernel_size=3, stride=2) - - outputs = [branch3x3, branch3x3dbl, branch_pool] - return outputs - - def forward(self, x): - outputs = self._forward(x) - return torch.cat(outputs, 1) - - -class InceptionC(nn.Module): - - def __init__(self, in_channels, channels_7x7, conv_block=None): - super(InceptionC, self).__init__() - if conv_block is None: - conv_block = BasicConv2d - self.branch1x1 = conv_block(in_channels, 192, kernel_size=1) - - c7 = channels_7x7 - self.branch7x7_1 = conv_block(in_channels, c7, kernel_size=1) - self.branch7x7_2 = conv_block(c7, c7, kernel_size=(1, 7), padding=(0, 3)) - self.branch7x7_3 = conv_block(c7, 192, kernel_size=(7, 1), padding=(3, 0)) - - self.branch7x7dbl_1 = conv_block(in_channels, c7, kernel_size=1) - self.branch7x7dbl_2 = conv_block(c7, c7, kernel_size=(7, 1), padding=(3, 0)) - self.branch7x7dbl_3 = conv_block(c7, c7, kernel_size=(1, 7), padding=(0, 3)) - self.branch7x7dbl_4 = conv_block(c7, c7, kernel_size=(7, 1), padding=(3, 0)) - self.branch7x7dbl_5 = conv_block(c7, 192, kernel_size=(1, 7), padding=(0, 3)) - - self.branch_pool = conv_block(in_channels, 192, kernel_size=1) - - def _forward(self, x): - branch1x1 = self.branch1x1(x) - - branch7x7 = self.branch7x7_1(x) - branch7x7 = self.branch7x7_2(branch7x7) - branch7x7 = self.branch7x7_3(branch7x7) - - branch7x7dbl = self.branch7x7dbl_1(x) - branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl) - branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl) - branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl) - branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl) - - branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1) - branch_pool = self.branch_pool(branch_pool) - - outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool] - return outputs - - def forward(self, x): - outputs = self._forward(x) - return torch.cat(outputs, 1) - - -class InceptionD(nn.Module): - - def __init__(self, in_channels, conv_block=None): - super(InceptionD, self).__init__() - if conv_block is None: - conv_block = BasicConv2d - self.branch3x3_1 = conv_block(in_channels, 192, kernel_size=1) - self.branch3x3_2 = conv_block(192, 320, kernel_size=3, stride=2) - - self.branch7x7x3_1 = conv_block(in_channels, 192, kernel_size=1) - self.branch7x7x3_2 = conv_block(192, 192, kernel_size=(1, 7), padding=(0, 3)) - self.branch7x7x3_3 = conv_block(192, 192, kernel_size=(7, 1), padding=(3, 0)) - self.branch7x7x3_4 = conv_block(192, 192, kernel_size=3, stride=2) - - def _forward(self, x): - branch3x3 = self.branch3x3_1(x) - branch3x3 = self.branch3x3_2(branch3x3) - - branch7x7x3 = self.branch7x7x3_1(x) - branch7x7x3 = self.branch7x7x3_2(branch7x7x3) - branch7x7x3 = 
self.branch7x7x3_3(branch7x7x3) - branch7x7x3 = self.branch7x7x3_4(branch7x7x3) - - branch_pool = F.max_pool2d(x, kernel_size=3, stride=2) - outputs = [branch3x3, branch7x7x3, branch_pool] - return outputs - - def forward(self, x): - outputs = self._forward(x) - return torch.cat(outputs, 1) - - -class InceptionE(nn.Module): - - def __init__(self, in_channels, conv_block=None): - super(InceptionE, self).__init__() - if conv_block is None: - conv_block = BasicConv2d - self.branch1x1 = conv_block(in_channels, 320, kernel_size=1) - - self.branch3x3_1 = conv_block(in_channels, 384, kernel_size=1) - self.branch3x3_2a = conv_block(384, 384, kernel_size=(1, 3), padding=(0, 1)) - self.branch3x3_2b = conv_block(384, 384, kernel_size=(3, 1), padding=(1, 0)) - - self.branch3x3dbl_1 = conv_block(in_channels, 448, kernel_size=1) - self.branch3x3dbl_2 = conv_block(448, 384, kernel_size=3, padding=1) - self.branch3x3dbl_3a = conv_block(384, 384, kernel_size=(1, 3), padding=(0, 1)) - self.branch3x3dbl_3b = conv_block(384, 384, kernel_size=(3, 1), padding=(1, 0)) - - self.branch_pool = conv_block(in_channels, 192, kernel_size=1) - - def _forward(self, x): - branch1x1 = self.branch1x1(x) - - branch3x3 = self.branch3x3_1(x) - branch3x3 = [ - self.branch3x3_2a(branch3x3), - self.branch3x3_2b(branch3x3), - ] - branch3x3 = torch.cat(branch3x3, 1) - - branch3x3dbl = self.branch3x3dbl_1(x) - branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) - branch3x3dbl = [ - self.branch3x3dbl_3a(branch3x3dbl), - self.branch3x3dbl_3b(branch3x3dbl), - ] - branch3x3dbl = torch.cat(branch3x3dbl, 1) - - branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1) - branch_pool = self.branch_pool(branch_pool) - - outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool] - return outputs - - def forward(self, x): - outputs = self._forward(x) - return torch.cat(outputs, 1) - - -class InceptionAux(nn.Module): - - def __init__(self, in_channels, num_classes, conv_block=None): - super(InceptionAux, self).__init__() - if conv_block is None: - conv_block = BasicConv2d - self.conv0 = conv_block(in_channels, 128, kernel_size=1) - self.conv1 = conv_block(128, 768, kernel_size=5) - self.conv1.stddev = 0.01 - self.fc = Linear(768, num_classes) - self.fc.stddev = 0.001 - - def forward(self, x): - # N x 768 x 17 x 17 - x = F.avg_pool2d(x, kernel_size=5, stride=3) - # N x 768 x 5 x 5 - x = self.conv0(x) - # N x 128 x 5 x 5 - x = self.conv1(x) - # N x 768 x 1 x 1 - # Adaptive average pooling - x = F.adaptive_avg_pool2d(x, (1, 1)) - # N x 768 x 1 x 1 - x = torch.flatten(x, 1) - # N x 768 - x = self.fc(x) - # N x 1000 - return x - - -class BasicConv2d(nn.Module): - - def __init__(self, in_channels, out_channels, **kwargs): - super(BasicConv2d, self).__init__() - self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs) - self.bn = nn.BatchNorm2d(out_channels, eps=0.001) - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - return F.relu(x, inplace=True) - - -class InceptionV3(nn.Module): - """Inception-V3 with no AuxLogits - FIXME two class defs are redundant, but less screwing around with torchscript fussiness and inconsistent returns - """ - - def __init__(self, num_classes=1000, in_chans=3, drop_rate=0., global_pool='avg', aux_logits=False): - super(InceptionV3, self).__init__() - self.num_classes = num_classes - self.drop_rate = drop_rate - self.aux_logits = aux_logits - - self.Conv2d_1a_3x3 = BasicConv2d(in_chans, 32, kernel_size=3, stride=2) - self.Conv2d_2a_3x3 = BasicConv2d(32, 32, kernel_size=3) -
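
The FIXME in the InceptionV3 docstring above alludes to TorchScript requiring a single, consistent return type from a scripted forward; a comment-only illustration of the constraint that motivates the two-class split:

# Illustrative only: one forward that sometimes returns a Tensor and sometimes
# a tuple will not script cleanly:
#   if self.aux_logits and self.training:
#       return x, aux    # Tuple[Tensor, Tensor]
#   return x             # Tensor -- inconsistent with the branch above
# Keeping InceptionV3 (Tensor) and InceptionV3Aux (tuple) separate keeps each
# forward monomorphic.
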
self.Conv2d_2b_3x3 = BasicConv2d(32, 64, kernel_size=3, padding=1) - self.Pool1 = nn.MaxPool2d(kernel_size=3, stride=2) - self.Conv2d_3b_1x1 = BasicConv2d(64, 80, kernel_size=1) - self.Conv2d_4a_3x3 = BasicConv2d(80, 192, kernel_size=3) - self.Pool2 = nn.MaxPool2d(kernel_size=3, stride=2) - self.Mixed_5b = InceptionA(192, pool_features=32) - self.Mixed_5c = InceptionA(256, pool_features=64) - self.Mixed_5d = InceptionA(288, pool_features=64) - self.Mixed_6a = InceptionB(288) - self.Mixed_6b = InceptionC(768, channels_7x7=128) - self.Mixed_6c = InceptionC(768, channels_7x7=160) - self.Mixed_6d = InceptionC(768, channels_7x7=160) - self.Mixed_6e = InceptionC(768, channels_7x7=192) - if aux_logits: - self.AuxLogits = InceptionAux(768, num_classes) - else: - self.AuxLogits = None - self.Mixed_7a = InceptionD(768) - self.Mixed_7b = InceptionE(1280) - self.Mixed_7c = InceptionE(2048) - self.feature_info = [ - dict(num_chs=64, reduction=2, module='Conv2d_2b_3x3'), - dict(num_chs=192, reduction=4, module='Conv2d_4a_3x3'), - dict(num_chs=288, reduction=8, module='Mixed_5d'), - dict(num_chs=768, reduction=16, module='Mixed_6e'), - dict(num_chs=2048, reduction=32, module='Mixed_7c'), - ] - - self.num_features = 2048 - self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) - - for m in self.modules(): - if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear): - stddev = m.stddev if hasattr(m, 'stddev') else 0.1 - trunc_normal_(m.weight, std=stddev) - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - def forward_preaux(self, x): - # N x 3 x 299 x 299 - x = self.Conv2d_1a_3x3(x) - # N x 32 x 149 x 149 - x = self.Conv2d_2a_3x3(x) - # N x 32 x 147 x 147 - x = self.Conv2d_2b_3x3(x) - # N x 64 x 147 x 147 - x = self.Pool1(x) - # N x 64 x 73 x 73 - x = self.Conv2d_3b_1x1(x) - # N x 80 x 73 x 73 - x = self.Conv2d_4a_3x3(x) - # N x 192 x 71 x 71 - x = self.Pool2(x) - # N x 192 x 35 x 35 - x = self.Mixed_5b(x) - # N x 256 x 35 x 35 - x = self.Mixed_5c(x) - # N x 288 x 35 x 35 - x = self.Mixed_5d(x) - # N x 288 x 35 x 35 - x = self.Mixed_6a(x) - # N x 768 x 17 x 17 - x = self.Mixed_6b(x) - # N x 768 x 17 x 17 - x = self.Mixed_6c(x) - # N x 768 x 17 x 17 - x = self.Mixed_6d(x) - # N x 768 x 17 x 17 - x = self.Mixed_6e(x) - # N x 768 x 17 x 17 - return x - - def forward_postaux(self, x): - x = self.Mixed_7a(x) - # N x 1280 x 8 x 8 - x = self.Mixed_7b(x) - # N x 2048 x 8 x 8 - x = self.Mixed_7c(x) - # N x 2048 x 8 x 8 - return x - - def forward_features(self, x): - x = self.forward_preaux(x) - x = self.forward_postaux(x) - return x - - def get_classifier(self): - return self.fc - - def reset_classifier(self, num_classes, global_pool='avg'): - self.num_classes = num_classes - self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) - - def forward(self, x): - x = self.forward_features(x) - x = self.global_pool(x) - if self.drop_rate > 0: - x = F.dropout(x, p=self.drop_rate, training=self.training) - x = self.fc(x) - return x - - -class InceptionV3Aux(InceptionV3): - """InceptionV3 with AuxLogits - """ - - def __init__(self, num_classes=1000, in_chans=3, drop_rate=0., global_pool='avg', aux_logits=True): - super(InceptionV3Aux, self).__init__( - num_classes, in_chans, drop_rate, global_pool, aux_logits) - - def forward_features(self, x): - x = self.forward_preaux(x) - aux = self.AuxLogits(x) if self.training else None - x = self.forward_postaux(x) - return x, aux - 
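
Since forward_features above only runs the aux head in training mode, a typical training step would look roughly like this (a sketch; the 0.4 aux-loss weight is an illustrative choice, not taken from this diff):

model = InceptionV3Aux(num_classes=1000)
model.train()
logits, aux = model(images)                 # images: N x 3 x 299 x 299
loss = F.cross_entropy(logits, target)
if aux is not None:                         # aux is None in eval mode
    loss = loss + 0.4 * F.cross_entropy(aux, target)
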
- def forward(self, x): - x, aux = self.forward_features(x) - x = self.global_pool(x) - if self.drop_rate > 0: - x = F.dropout(x, p=self.drop_rate, training=self.training) - x = self.fc(x) - return x, aux - - -def _create_inception_v3(variant, pretrained=False, **kwargs): - default_cfg = default_cfgs[variant] - aux_logits = kwargs.pop('aux_logits', False) - if aux_logits: - assert not kwargs.pop('features_only', False) - model_cls = InceptionV3Aux - load_strict = default_cfg['has_aux'] - else: - model_cls = InceptionV3 - load_strict = not default_cfg['has_aux'] - return build_model_with_cfg( - model_cls, variant, pretrained, - default_cfg=default_cfg, - pretrained_strict=load_strict, - **kwargs) - - -@register_model -def inception_v3(pretrained=False, **kwargs): - # original PyTorch weights, ported from Tensorflow but modified - model = _create_inception_v3('inception_v3', pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_inception_v3(pretrained=False, **kwargs): - # my port of Tensorflow SLIM weights (http://download.tensorflow.org/models/inception_v3_2016_08_28.tar.gz) - model = _create_inception_v3('tf_inception_v3', pretrained=pretrained, **kwargs) - return model - - -@register_model -def adv_inception_v3(pretrained=False, **kwargs): - # my port of Tensorflow adversarially trained Inception V3 from - # http://download.tensorflow.org/models/adv_inception_v3_2017_08_18.tar.gz - model = _create_inception_v3('adv_inception_v3', pretrained=pretrained, **kwargs) - return model - - -@register_model -def gluon_inception_v3(pretrained=False, **kwargs): - # from gluon pretrained models, best performing in terms of accuracy/loss metrics - # https://gluon-cv.mxnet.io/model_zoo/classification.html - model = _create_inception_v3('gluon_inception_v3', pretrained=pretrained, **kwargs) - return model diff --git a/AVLFormer/src/timm/models/inception_v4.py b/AVLFormer/src/timm/models/inception_v4.py deleted file mode 100644 index f6c8f17..0000000 --- a/AVLFormer/src/timm/models/inception_v4.py +++ /dev/null @@ -1,316 +0,0 @@ -""" Pytorch Inception-V4 implementation -Sourced from https://github.com/Cadene/tensorflow-model-zoo.torch (MIT License) which is -based upon Google's Tensorflow implementation and pretrained weights (Apache 2.0 License) -""" -import torch -import torch.nn as nn -import torch.nn.functional as F - -from src.timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD -from .helpers import build_model_with_cfg -from .layers import create_classifier -from .registry import register_model - -__all__ = ['InceptionV4'] - -default_cfgs = { - 'inception_v4': { - 'url': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-cadene/inceptionv4-8e4777a0.pth', - 'num_classes': 1000, 'input_size': (3, 299, 299), 'pool_size': (8, 8), - 'crop_pct': 0.875, 'interpolation': 'bicubic', - 'mean': IMAGENET_INCEPTION_MEAN, 'std': IMAGENET_INCEPTION_STD, - 'first_conv': 'features.0.conv', 'classifier': 'last_linear', - 'label_offset': 1, # 1001 classes in pretrained weights - } -} - - -class BasicConv2d(nn.Module): - def __init__(self, in_planes, out_planes, kernel_size, stride, padding=0): - super(BasicConv2d, self).__init__() - self.conv = nn.Conv2d( - in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, bias=False) - self.bn = nn.BatchNorm2d(out_planes, eps=0.001) - self.relu = nn.ReLU(inplace=True) - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.relu(x) - return x - - -class Mixed3a(nn.Module): - def 
__init__(self): - super(Mixed3a, self).__init__() - self.maxpool = nn.MaxPool2d(3, stride=2) - self.conv = BasicConv2d(64, 96, kernel_size=3, stride=2) - - def forward(self, x): - x0 = self.maxpool(x) - x1 = self.conv(x) - out = torch.cat((x0, x1), 1) - return out - - -class Mixed4a(nn.Module): - def __init__(self): - super(Mixed4a, self).__init__() - - self.branch0 = nn.Sequential( - BasicConv2d(160, 64, kernel_size=1, stride=1), - BasicConv2d(64, 96, kernel_size=3, stride=1) - ) - - self.branch1 = nn.Sequential( - BasicConv2d(160, 64, kernel_size=1, stride=1), - BasicConv2d(64, 64, kernel_size=(1, 7), stride=1, padding=(0, 3)), - BasicConv2d(64, 64, kernel_size=(7, 1), stride=1, padding=(3, 0)), - BasicConv2d(64, 96, kernel_size=(3, 3), stride=1) - ) - - def forward(self, x): - x0 = self.branch0(x) - x1 = self.branch1(x) - out = torch.cat((x0, x1), 1) - return out - - -class Mixed5a(nn.Module): - def __init__(self): - super(Mixed5a, self).__init__() - self.conv = BasicConv2d(192, 192, kernel_size=3, stride=2) - self.maxpool = nn.MaxPool2d(3, stride=2) - - def forward(self, x): - x0 = self.conv(x) - x1 = self.maxpool(x) - out = torch.cat((x0, x1), 1) - return out - - -class InceptionA(nn.Module): - def __init__(self): - super(InceptionA, self).__init__() - self.branch0 = BasicConv2d(384, 96, kernel_size=1, stride=1) - - self.branch1 = nn.Sequential( - BasicConv2d(384, 64, kernel_size=1, stride=1), - BasicConv2d(64, 96, kernel_size=3, stride=1, padding=1) - ) - - self.branch2 = nn.Sequential( - BasicConv2d(384, 64, kernel_size=1, stride=1), - BasicConv2d(64, 96, kernel_size=3, stride=1, padding=1), - BasicConv2d(96, 96, kernel_size=3, stride=1, padding=1) - ) - - self.branch3 = nn.Sequential( - nn.AvgPool2d(3, stride=1, padding=1, count_include_pad=False), - BasicConv2d(384, 96, kernel_size=1, stride=1) - ) - - def forward(self, x): - x0 = self.branch0(x) - x1 = self.branch1(x) - x2 = self.branch2(x) - x3 = self.branch3(x) - out = torch.cat((x0, x1, x2, x3), 1) - return out - - -class ReductionA(nn.Module): - def __init__(self): - super(ReductionA, self).__init__() - self.branch0 = BasicConv2d(384, 384, kernel_size=3, stride=2) - - self.branch1 = nn.Sequential( - BasicConv2d(384, 192, kernel_size=1, stride=1), - BasicConv2d(192, 224, kernel_size=3, stride=1, padding=1), - BasicConv2d(224, 256, kernel_size=3, stride=2) - ) - - self.branch2 = nn.MaxPool2d(3, stride=2) - - def forward(self, x): - x0 = self.branch0(x) - x1 = self.branch1(x) - x2 = self.branch2(x) - out = torch.cat((x0, x1, x2), 1) - return out - - -class InceptionB(nn.Module): - def __init__(self): - super(InceptionB, self).__init__() - self.branch0 = BasicConv2d(1024, 384, kernel_size=1, stride=1) - - self.branch1 = nn.Sequential( - BasicConv2d(1024, 192, kernel_size=1, stride=1), - BasicConv2d(192, 224, kernel_size=(1, 7), stride=1, padding=(0, 3)), - BasicConv2d(224, 256, kernel_size=(7, 1), stride=1, padding=(3, 0)) - ) - - self.branch2 = nn.Sequential( - BasicConv2d(1024, 192, kernel_size=1, stride=1), - BasicConv2d(192, 192, kernel_size=(7, 1), stride=1, padding=(3, 0)), - BasicConv2d(192, 224, kernel_size=(1, 7), stride=1, padding=(0, 3)), - BasicConv2d(224, 224, kernel_size=(7, 1), stride=1, padding=(3, 0)), - BasicConv2d(224, 256, kernel_size=(1, 7), stride=1, padding=(0, 3)) - ) - - self.branch3 = nn.Sequential( - nn.AvgPool2d(3, stride=1, padding=1, count_include_pad=False), - BasicConv2d(1024, 128, kernel_size=1, stride=1) - ) - - def forward(self, x): - x0 = self.branch0(x) - x1 = self.branch1(x) - x2 = 
self.branch2(x) - x3 = self.branch3(x) - out = torch.cat((x0, x1, x2, x3), 1) - return out - - -class ReductionB(nn.Module): - def __init__(self): - super(ReductionB, self).__init__() - - self.branch0 = nn.Sequential( - BasicConv2d(1024, 192, kernel_size=1, stride=1), - BasicConv2d(192, 192, kernel_size=3, stride=2) - ) - - self.branch1 = nn.Sequential( - BasicConv2d(1024, 256, kernel_size=1, stride=1), - BasicConv2d(256, 256, kernel_size=(1, 7), stride=1, padding=(0, 3)), - BasicConv2d(256, 320, kernel_size=(7, 1), stride=1, padding=(3, 0)), - BasicConv2d(320, 320, kernel_size=3, stride=2) - ) - - self.branch2 = nn.MaxPool2d(3, stride=2) - - def forward(self, x): - x0 = self.branch0(x) - x1 = self.branch1(x) - x2 = self.branch2(x) - out = torch.cat((x0, x1, x2), 1) - return out - - -class InceptionC(nn.Module): - def __init__(self): - super(InceptionC, self).__init__() - - self.branch0 = BasicConv2d(1536, 256, kernel_size=1, stride=1) - - self.branch1_0 = BasicConv2d(1536, 384, kernel_size=1, stride=1) - self.branch1_1a = BasicConv2d(384, 256, kernel_size=(1, 3), stride=1, padding=(0, 1)) - self.branch1_1b = BasicConv2d(384, 256, kernel_size=(3, 1), stride=1, padding=(1, 0)) - - self.branch2_0 = BasicConv2d(1536, 384, kernel_size=1, stride=1) - self.branch2_1 = BasicConv2d(384, 448, kernel_size=(3, 1), stride=1, padding=(1, 0)) - self.branch2_2 = BasicConv2d(448, 512, kernel_size=(1, 3), stride=1, padding=(0, 1)) - self.branch2_3a = BasicConv2d(512, 256, kernel_size=(1, 3), stride=1, padding=(0, 1)) - self.branch2_3b = BasicConv2d(512, 256, kernel_size=(3, 1), stride=1, padding=(1, 0)) - - self.branch3 = nn.Sequential( - nn.AvgPool2d(3, stride=1, padding=1, count_include_pad=False), - BasicConv2d(1536, 256, kernel_size=1, stride=1) - ) - - def forward(self, x): - x0 = self.branch0(x) - - x1_0 = self.branch1_0(x) - x1_1a = self.branch1_1a(x1_0) - x1_1b = self.branch1_1b(x1_0) - x1 = torch.cat((x1_1a, x1_1b), 1) - - x2_0 = self.branch2_0(x) - x2_1 = self.branch2_1(x2_0) - x2_2 = self.branch2_2(x2_1) - x2_3a = self.branch2_3a(x2_2) - x2_3b = self.branch2_3b(x2_2) - x2 = torch.cat((x2_3a, x2_3b), 1) - - x3 = self.branch3(x) - - out = torch.cat((x0, x1, x2, x3), 1) - return out - - -class InceptionV4(nn.Module): - def __init__(self, num_classes=1000, in_chans=3, output_stride=32, drop_rate=0., global_pool='avg'): - super(InceptionV4, self).__init__() - assert output_stride == 32 - self.drop_rate = drop_rate - self.num_classes = num_classes - self.num_features = 1536 - - self.features = nn.Sequential( - BasicConv2d(in_chans, 32, kernel_size=3, stride=2), - BasicConv2d(32, 32, kernel_size=3, stride=1), - BasicConv2d(32, 64, kernel_size=3, stride=1, padding=1), - Mixed3a(), - Mixed4a(), - Mixed5a(), - InceptionA(), - InceptionA(), - InceptionA(), - InceptionA(), - ReductionA(), # Mixed6a - InceptionB(), - InceptionB(), - InceptionB(), - InceptionB(), - InceptionB(), - InceptionB(), - InceptionB(), - ReductionB(), # Mixed7a - InceptionC(), - InceptionC(), - InceptionC(), - ) - self.feature_info = [ - dict(num_chs=64, reduction=2, module='features.2'), - dict(num_chs=160, reduction=4, module='features.3'), - dict(num_chs=384, reduction=8, module='features.9'), - dict(num_chs=1024, reduction=16, module='features.17'), - dict(num_chs=1536, reduction=32, module='features.21'), - ] - self.global_pool, self.last_linear = create_classifier( - self.num_features, self.num_classes, pool_type=global_pool) - - def get_classifier(self): - return self.last_linear - - def reset_classifier(self, num_classes, 
global_pool='avg'): - self.num_classes = num_classes - self.global_pool, self.last_linear = create_classifier( - self.num_features, self.num_classes, pool_type=global_pool) - - def forward_features(self, x): - return self.features(x) - - def forward(self, x): - x = self.forward_features(x) - x = self.global_pool(x) - if self.drop_rate > 0: - x = F.dropout(x, p=self.drop_rate, training=self.training) - x = self.last_linear(x) - return x - - -def _create_inception_v4(variant, pretrained=False, **kwargs): - return build_model_with_cfg( - InceptionV4, variant, pretrained, - default_cfg=default_cfgs[variant], - feature_cfg=dict(flatten_sequential=True), - **kwargs) - - -@register_model -def inception_v4(pretrained=False, **kwargs): - return _create_inception_v4('inception_v4', pretrained, **kwargs) diff --git a/AVLFormer/src/timm/models/layers/__init__.py b/AVLFormer/src/timm/models/layers/__init__.py deleted file mode 100644 index ac0b6b4..0000000 --- a/AVLFormer/src/timm/models/layers/__init__.py +++ /dev/null @@ -1,36 +0,0 @@ -from .activations import * -from .adaptive_avgmax_pool import \ - adaptive_avgmax_pool2d, select_adaptive_pool2d, AdaptiveAvgMaxPool2d, SelectAdaptivePool2d -from .anti_aliasing import AntiAliasDownsampleLayer -from .blur_pool import BlurPool2d -from .classifier import ClassifierHead, create_classifier -from .cond_conv2d import CondConv2d, get_condconv_initializer -from .config import is_exportable, is_scriptable, is_no_jit, set_exportable, set_scriptable, set_no_jit,\ - set_layer_config -from .conv2d_same import Conv2dSame, conv2d_same -from .conv_bn_act import ConvBnAct -from .create_act import create_act_layer, get_act_layer, get_act_fn -from .create_attn import get_attn, create_attn -from .create_conv2d import create_conv2d -from .create_norm_act import get_norm_act_layer, create_norm_act, convert_norm_act -from .create_self_attn import get_self_attn, create_self_attn -from .drop import DropBlock2d, DropPath, drop_block_2d, drop_path -from .eca import EcaModule, CecaModule -from .evo_norm import EvoNormBatch2d, EvoNormSample2d -from .helpers import to_ntuple, to_2tuple, to_3tuple, to_4tuple, make_divisible -from .inplace_abn import InplaceAbn -from .linear import Linear -from .mixed_conv2d import MixedConv2d -from .norm import GroupNorm -from .norm_act import BatchNormAct2d, GroupNormAct -from .padding import get_padding, get_same_padding, pad_same -from .pool2d_same import AvgPool2dSame, create_pool2d -from .se import SEModule -from .selective_kernel import SelectiveKernelConv -from .separable_conv import SeparableConv2d, SeparableConvBnAct -from .space_to_depth import SpaceToDepthModule -from .split_attn import SplitAttnConv2d -from .split_batchnorm import SplitBatchNorm2d, convert_splitbn_model -from .std_conv import StdConv2d, StdConv2dSame, ScaledStdConv2d, ScaledStdConv2dSame -from .test_time_pool import TestTimePoolHead, apply_test_time_pool -from .weight_init import trunc_normal_, variance_scaling_, lecun_normal_ diff --git a/AVLFormer/src/timm/models/layers/activations.py b/AVLFormer/src/timm/models/layers/activations.py deleted file mode 100644 index e16b3bd..0000000 --- a/AVLFormer/src/timm/models/layers/activations.py +++ /dev/null @@ -1,145 +0,0 @@ -""" Activations - -A collection of activations fn and modules with a common interface so that they can -easily be swapped. All have an `inplace` arg even if not used. 
- -Hacked together by / Copyright 2020 Ross Wightman -""" - -import torch -from torch import nn as nn -from torch.nn import functional as F - - -def swish(x, inplace: bool = False): - """Swish - Described in: https://arxiv.org/abs/1710.05941 - """ - return x.mul_(x.sigmoid()) if inplace else x.mul(x.sigmoid()) - - -class Swish(nn.Module): - def __init__(self, inplace: bool = False): - super(Swish, self).__init__() - self.inplace = inplace - - def forward(self, x): - return swish(x, self.inplace) - - -def mish(x, inplace: bool = False): - """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681 - NOTE: I don't have a working inplace variant - """ - return x.mul(F.softplus(x).tanh()) - - -class Mish(nn.Module): - """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681 - """ - def __init__(self, inplace: bool = False): - super(Mish, self).__init__() - - def forward(self, x): - return mish(x) - - -def sigmoid(x, inplace: bool = False): - return x.sigmoid_() if inplace else x.sigmoid() - - -# PyTorch has this, but not with a consistent inplace argument interface -class Sigmoid(nn.Module): - def __init__(self, inplace: bool = False): - super(Sigmoid, self).__init__() - self.inplace = inplace - - def forward(self, x): - return x.sigmoid_() if self.inplace else x.sigmoid() - - -def tanh(x, inplace: bool = False): - return x.tanh_() if inplace else x.tanh() - - -# PyTorch has this, but not with a consistent inplace argument interface -class Tanh(nn.Module): - def __init__(self, inplace: bool = False): - super(Tanh, self).__init__() - self.inplace = inplace - - def forward(self, x): - return x.tanh_() if self.inplace else x.tanh() - - -def hard_swish(x, inplace: bool = False): - inner = F.relu6(x + 3.).div_(6.) - return x.mul_(inner) if inplace else x.mul(inner) - - -class HardSwish(nn.Module): - def __init__(self, inplace: bool = False): - super(HardSwish, self).__init__() - self.inplace = inplace - - def forward(self, x): - return hard_swish(x, self.inplace) - - -def hard_sigmoid(x, inplace: bool = False): - if inplace: - return x.add_(3.).clamp_(0., 6.).div_(6.) - else: - return F.relu6(x + 3.) / 6.
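The functional/module pairs in this deleted file all share one contract: a functional form taking `(x, inplace)` and a module form taking an `inplace` kwarg, so the factories elsewhere in `timm` can swap them freely. A minimal sketch (not part of the deleted files, assuming only PyTorch) illustrating that contract with `hard_sigmoid`:

```python
import torch
import torch.nn.functional as F


def hard_sigmoid(x, inplace: bool = False):
    # mirrors the deleted impl: mutate x in place, or build a new tensor
    if inplace:
        return x.add_(3.).clamp_(0., 6.).div_(6.)
    return F.relu6(x + 3.) / 6.


x = torch.randn(4)
out = hard_sigmoid(x.clone())            # out-of-place: the input is untouched
buf = x.clone()
res = hard_sigmoid(buf, inplace=True)    # in-place: buf is mutated and returned
assert torch.allclose(out, res)
assert res.data_ptr() == buf.data_ptr()  # same storage, no new allocation
```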
- - -class HardSigmoid(nn.Module): - def __init__(self, inplace: bool = False): - super(HardSigmoid, self).__init__() - self.inplace = inplace - - def forward(self, x): - return hard_sigmoid(x, self.inplace) - - -def hard_mish(x, inplace: bool = False): - """ Hard Mish - Experimental, based on notes by Mish author Diganta Misra at - https://github.com/digantamisra98/H-Mish/blob/0da20d4bc58e696b6803f2523c58d3c8a82782d0/README.md - """ - if inplace: - return x.mul_(0.5 * (x + 2).clamp(min=0, max=2)) - else: - return 0.5 * x * (x + 2).clamp(min=0, max=2) - - -class HardMish(nn.Module): - def __init__(self, inplace: bool = False): - super(HardMish, self).__init__() - self.inplace = inplace - - def forward(self, x): - return hard_mish(x, self.inplace) - - -class PReLU(nn.PReLU): - """Applies PReLU (w/ dummy inplace arg) - """ - def __init__(self, num_parameters: int = 1, init: float = 0.25, inplace: bool = False) -> None: - super(PReLU, self).__init__(num_parameters=num_parameters, init=init) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - return F.prelu(input, self.weight) - - -def gelu(x: torch.Tensor, inplace: bool = False) -> torch.Tensor: - return F.gelu(x) - - -class GELU(nn.Module): - """Applies the Gaussian Error Linear Units function (w/ dummy inplace arg) - """ - def __init__(self, inplace: bool = False): - super(GELU, self).__init__() - - def forward(self, input: torch.Tensor) -> torch.Tensor: - return F.gelu(input) diff --git a/AVLFormer/src/timm/models/layers/activations_jit.py b/AVLFormer/src/timm/models/layers/activations_jit.py deleted file mode 100644 index b4a5165..0000000 --- a/AVLFormer/src/timm/models/layers/activations_jit.py +++ /dev/null @@ -1,90 +0,0 @@ -""" Activations - -A collection of jit-scripted activations fn and modules with a common interface so that they can -easily be swapped. All have an `inplace` arg even if not used. - -All jit scripted activations are lacking in-place variations on purpose, scripted kernel fusion does not -currently work across in-place op boundaries, thus performance is equal to or less than the non-scripted -versions if they contain in-place ops. - -Hacked together by / Copyright 2020 Ross Wightman -""" - -import torch -from torch import nn as nn -from torch.nn import functional as F - - -@torch.jit.script -def swish_jit(x, inplace: bool = False): - """Swish - Described in: https://arxiv.org/abs/1710.05941 - """ - return x.mul(x.sigmoid()) - - -@torch.jit.script -def mish_jit(x, _inplace: bool = False): - """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681 - """ - return x.mul(F.softplus(x).tanh()) - - -class SwishJit(nn.Module): - def __init__(self, inplace: bool = False): - super(SwishJit, self).__init__() - - def forward(self, x): - return swish_jit(x) - - -class MishJit(nn.Module): - def __init__(self, inplace: bool = False): - super(MishJit, self).__init__() - - def forward(self, x): - return mish_jit(x) - - -@torch.jit.script -def hard_sigmoid_jit(x, inplace: bool = False): - # return F.relu6(x + 3.) / 6. - return (x + 3).clamp(min=0, max=6).div(6.) # clamp seems ever so slightly faster? - - -class HardSigmoidJit(nn.Module): - def __init__(self, inplace: bool = False): - super(HardSigmoidJit, self).__init__() - - def forward(self, x): - return hard_sigmoid_jit(x) - - -@torch.jit.script -def hard_swish_jit(x, inplace: bool = False): - # return x * (F.relu6(x + 3.) / 6) - return x * (x + 3).clamp(min=0, max=6).div(6.) # clamp seems ever so slightly faster? 
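The `*_jit` variants in this deleted file lean on `torch.jit.script` for kernel fusion and deliberately avoid in-place ops, since fusion does not cross in-place boundaries. A hedged sketch (not from the diff) checking that the clamp-based scripted form matches the plain `relu6` formulation numerically:

```python
import torch
import torch.nn.functional as F


@torch.jit.script
def hard_swish_jit(x, inplace: bool = False):
    # clamp-based form, as in the deleted file; `inplace` is a dummy arg
    return x * (x + 3).clamp(min=0, max=6).div(6.)


x = torch.randn(1024)
reference = x * F.relu6(x + 3.) / 6.
assert torch.allclose(hard_swish_jit(x), reference)
```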
- - -class HardSwishJit(nn.Module): - def __init__(self, inplace: bool = False): - super(HardSwishJit, self).__init__() - - def forward(self, x): - return hard_swish_jit(x) - - -@torch.jit.script -def hard_mish_jit(x, inplace: bool = False): - """ Hard Mish - Experimental, based on notes by Mish author Diganta Misra at - https://github.com/digantamisra98/H-Mish/blob/0da20d4bc58e696b6803f2523c58d3c8a82782d0/README.md - """ - return 0.5 * x * (x + 2).clamp(min=0, max=2) - - -class HardMishJit(nn.Module): - def __init__(self, inplace: bool = False): - super(HardMishJit, self).__init__() - - def forward(self, x): - return hard_mish_jit(x) diff --git a/AVLFormer/src/timm/models/layers/activations_me.py b/AVLFormer/src/timm/models/layers/activations_me.py deleted file mode 100644 index 9a12bb7..0000000 --- a/AVLFormer/src/timm/models/layers/activations_me.py +++ /dev/null @@ -1,218 +0,0 @@ -""" Activations (memory-efficient w/ custom autograd) - -A collection of activations fn and modules with a common interface so that they can -easily be swapped. All have an `inplace` arg even if not used. - -These activations are not compatible with jit scripting or ONNX export of the model; please use either -the JIT or basic versions of the activations. - -Hacked together by / Copyright 2020 Ross Wightman -""" - -import torch -from torch import nn as nn -from torch.nn import functional as F - - -@torch.jit.script -def swish_jit_fwd(x): - return x.mul(torch.sigmoid(x)) - - -@torch.jit.script -def swish_jit_bwd(x, grad_output): - x_sigmoid = torch.sigmoid(x) - return grad_output * (x_sigmoid * (1 + x * (1 - x_sigmoid))) - - -class SwishJitAutoFn(torch.autograd.Function): - """ torch.jit.script optimised Swish w/ memory-efficient checkpoint - Inspired by conversation between Jeremy Howard & Adam Paszke - https://twitter.com/jeremyphoward/status/1188251041835315200 - """ - @staticmethod - def symbolic(g, x): - return g.op("Mul", x, g.op("Sigmoid", x)) - - @staticmethod - def forward(ctx, x): - ctx.save_for_backward(x) - return swish_jit_fwd(x) - - @staticmethod - def backward(ctx, grad_output): - x = ctx.saved_tensors[0] - return swish_jit_bwd(x, grad_output) - - -def swish_me(x, inplace=False): - return SwishJitAutoFn.apply(x) - - -class SwishMe(nn.Module): - def __init__(self, inplace: bool = False): - super(SwishMe, self).__init__() - - def forward(self, x): - return SwishJitAutoFn.apply(x) - - -@torch.jit.script -def mish_jit_fwd(x): - return x.mul(torch.tanh(F.softplus(x))) - - -@torch.jit.script -def mish_jit_bwd(x, grad_output): - x_sigmoid = torch.sigmoid(x) - x_tanh_sp = F.softplus(x).tanh() - return grad_output.mul(x_tanh_sp + x * x_sigmoid * (1 - x_tanh_sp * x_tanh_sp)) - - -class MishJitAutoFn(torch.autograd.Function): - """ Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681 - A memory efficient, jit scripted variant of Mish - """ - @staticmethod - def forward(ctx, x): - ctx.save_for_backward(x) - return mish_jit_fwd(x) - - @staticmethod - def backward(ctx, grad_output): - x = ctx.saved_tensors[0] - return mish_jit_bwd(x, grad_output) - - -def mish_me(x, inplace=False): - return MishJitAutoFn.apply(x) - - -class MishMe(nn.Module): - def __init__(self, inplace: bool = False): - super(MishMe, self).__init__() - - def forward(self, x): - return MishJitAutoFn.apply(x) - - -@torch.jit.script -def hard_sigmoid_jit_fwd(x, inplace: bool = False): - return (x + 3).clamp(min=0, max=6).div(6.)
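The memory-efficient variants in this deleted file all follow the same custom-autograd pattern: `forward` saves only the raw input, and `backward` recomputes the cheap intermediates rather than keeping activation outputs alive. A condensed sketch of that pattern for Swish (outside the diff, assuming only PyTorch), with the analytic gradient checked against autograd:

```python
import torch


class SwishFn(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)      # keep only the input tensor
        return x * torch.sigmoid(x)

    @staticmethod
    def backward(ctx, grad_output):
        x, = ctx.saved_tensors
        s = torch.sigmoid(x)          # recomputed here, not stored in forward
        return grad_output * (s * (1 + x * (1 - s)))


x = torch.randn(5, dtype=torch.double, requires_grad=True)
assert torch.autograd.gradcheck(SwishFn.apply, (x,))  # raises if the gradient is wrong
```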
- - -@torch.jit.script -def hard_sigmoid_jit_bwd(x, grad_output): - m = torch.ones_like(x) * ((x >= -3.) & (x <= 3.)) / 6. - return grad_output * m - - -class HardSigmoidJitAutoFn(torch.autograd.Function): - @staticmethod - def forward(ctx, x): - ctx.save_for_backward(x) - return hard_sigmoid_jit_fwd(x) - - @staticmethod - def backward(ctx, grad_output): - x = ctx.saved_tensors[0] - return hard_sigmoid_jit_bwd(x, grad_output) - - -def hard_sigmoid_me(x, inplace: bool = False): - return HardSigmoidJitAutoFn.apply(x) - - -class HardSigmoidMe(nn.Module): - def __init__(self, inplace: bool = False): - super(HardSigmoidMe, self).__init__() - - def forward(self, x): - return HardSigmoidJitAutoFn.apply(x) - - -@torch.jit.script -def hard_swish_jit_fwd(x): - return x * (x + 3).clamp(min=0, max=6).div(6.) - - -@torch.jit.script -def hard_swish_jit_bwd(x, grad_output): - m = torch.ones_like(x) * (x >= 3.) - m = torch.where((x >= -3.) & (x <= 3.), x / 3. + .5, m) - return grad_output * m - - -class HardSwishJitAutoFn(torch.autograd.Function): - """A memory efficient, jit-scripted HardSwish activation""" - @staticmethod - def forward(ctx, x): - ctx.save_for_backward(x) - return hard_swish_jit_fwd(x) - - @staticmethod - def backward(ctx, grad_output): - x = ctx.saved_tensors[0] - return hard_swish_jit_bwd(x, grad_output) - - @staticmethod - def symbolic(g, self): - input = g.op("Add", self, g.op('Constant', value_t=torch.tensor(3, dtype=torch.float))) - hardtanh_ = g.op("Clip", input, g.op('Constant', value_t=torch.tensor(0, dtype=torch.float)), g.op('Constant', value_t=torch.tensor(6, dtype=torch.float))) - hardtanh_ = g.op("Div", hardtanh_, g.op('Constant', value_t=torch.tensor(6, dtype=torch.float))) - return g.op("Mul", self, hardtanh_) - - -def hard_swish_me(x, inplace=False): - return HardSwishJitAutoFn.apply(x) - - -class HardSwishMe(nn.Module): - def __init__(self, inplace: bool = False): - super(HardSwishMe, self).__init__() - - def forward(self, x): - return HardSwishJitAutoFn.apply(x) - - -@torch.jit.script -def hard_mish_jit_fwd(x): - return 0.5 * x * (x + 2).clamp(min=0, max=2) - - -@torch.jit.script -def hard_mish_jit_bwd(x, grad_output): - m = torch.ones_like(x) * (x >= -2.) - m = torch.where((x >= -2.) 
& (x <= 0.), x + 1., m) - return grad_output * m - - -class HardMishJitAutoFn(torch.autograd.Function): - """ A memory efficient, jit scripted variant of Hard Mish - Experimental, based on notes by Mish author Diganta Misra at - https://github.com/digantamisra98/H-Mish/blob/0da20d4bc58e696b6803f2523c58d3c8a82782d0/README.md - """ - @staticmethod - def forward(ctx, x): - ctx.save_for_backward(x) - return hard_mish_jit_fwd(x) - - @staticmethod - def backward(ctx, grad_output): - x = ctx.saved_tensors[0] - return hard_mish_jit_bwd(x, grad_output) - - -def hard_mish_me(x, inplace: bool = False): - return HardMishJitAutoFn.apply(x) - - -class HardMishMe(nn.Module): - def __init__(self, inplace: bool = False): - super(HardMishMe, self).__init__() - - def forward(self, x): - return HardMishJitAutoFn.apply(x) - - - diff --git a/AVLFormer/src/timm/models/layers/adaptive_avgmax_pool.py b/AVLFormer/src/timm/models/layers/adaptive_avgmax_pool.py deleted file mode 100644 index d2bb9f7..0000000 --- a/AVLFormer/src/timm/models/layers/adaptive_avgmax_pool.py +++ /dev/null @@ -1,119 +0,0 @@ -""" PyTorch selectable adaptive pooling -Adaptive pooling with the ability to select the type of pooling from: - * 'avg' - Average pooling - * 'max' - Max pooling - * 'avgmax' - Sum of average and max pooling re-scaled by 0.5 - * 'catavgmax' - Concatenation of average and max pooling along feature dim, doubles feature dim - -Both a functional and a nn.Module version of the pooling are provided. - -Hacked together by / Copyright 2020 Ross Wightman -""" -import torch -import torch.nn as nn -import torch.nn.functional as F - - -def adaptive_pool_feat_mult(pool_type='avg'): - if pool_type == 'catavgmax': - return 2 - else: - return 1 - - -def adaptive_avgmax_pool2d(x, output_size=1): - x_avg = F.adaptive_avg_pool2d(x, output_size) - x_max = F.adaptive_max_pool2d(x, output_size) - return 0.5 * (x_avg + x_max) - - -def adaptive_catavgmax_pool2d(x, output_size=1): - x_avg = F.adaptive_avg_pool2d(x, output_size) - x_max = F.adaptive_max_pool2d(x, output_size) - return torch.cat((x_avg, x_max), 1) - - -def select_adaptive_pool2d(x, pool_type='avg', output_size=1): - """Selectable global pooling function with dynamic input kernel size - """ - if pool_type == 'avg': - x = F.adaptive_avg_pool2d(x, output_size) - elif pool_type == 'avgmax': - x = adaptive_avgmax_pool2d(x, output_size) - elif pool_type == 'catavgmax': - x = adaptive_catavgmax_pool2d(x, output_size) - elif pool_type == 'max': - x = F.adaptive_max_pool2d(x, output_size) - else: - assert False, 'Invalid pool type: %s' % pool_type - return x - - -class FastAdaptiveAvgPool2d(nn.Module): - def __init__(self, flatten=False): - super(FastAdaptiveAvgPool2d, self).__init__() - self.flatten = flatten - - def forward(self, x): - return x.mean((2, 3)) if self.flatten else x.mean((2, 3), keepdim=True) - - -class AdaptiveAvgMaxPool2d(nn.Module): - def __init__(self, output_size=1): - super(AdaptiveAvgMaxPool2d, self).__init__() - self.output_size = output_size - - def forward(self, x): - return adaptive_avgmax_pool2d(x, self.output_size) - - -class AdaptiveCatAvgMaxPool2d(nn.Module): - def __init__(self, output_size=1): - super(AdaptiveCatAvgMaxPool2d, self).__init__() - self.output_size = output_size - - def forward(self, x): - return adaptive_catavgmax_pool2d(x, self.output_size) - - -class SelectAdaptivePool2d(nn.Module): - """Selectable global pooling layer with dynamic input kernel size - """ - def __init__(self, output_size=1, pool_type='fast', flatten=False): -
super(SelectAdaptivePool2d, self).__init__() - self.pool_type = pool_type or '' # convert other falsy values to empty string for consistent TS typing - self.flatten = flatten - if pool_type == '': - self.pool = nn.Identity() # pass through - elif pool_type == 'fast': - assert output_size == 1 - self.pool = FastAdaptiveAvgPool2d(self.flatten) - self.flatten = False - elif pool_type == 'avg': - self.pool = nn.AdaptiveAvgPool2d(output_size) - elif pool_type == 'avgmax': - self.pool = AdaptiveAvgMaxPool2d(output_size) - elif pool_type == 'catavgmax': - self.pool = AdaptiveCatAvgMaxPool2d(output_size) - elif pool_type == 'max': - self.pool = nn.AdaptiveMaxPool2d(output_size) - else: - assert False, 'Invalid pool type: %s' % pool_type - - def is_identity(self): - return self.pool_type == '' - - def forward(self, x): - x = self.pool(x) - if self.flatten: - x = x.flatten(1) - return x - - def feat_mult(self): - return adaptive_pool_feat_mult(self.pool_type) - - def __repr__(self): - return self.__class__.__name__ + ' (' \ - + 'pool_type=' + self.pool_type \ - + ', flatten=' + str(self.flatten) + ')' - diff --git a/AVLFormer/src/timm/models/layers/anti_aliasing.py b/AVLFormer/src/timm/models/layers/anti_aliasing.py deleted file mode 100644 index 9d3837e..0000000 --- a/AVLFormer/src/timm/models/layers/anti_aliasing.py +++ /dev/null @@ -1,60 +0,0 @@ -import torch -import torch.nn.parallel -import torch.nn as nn -import torch.nn.functional as F - - -class AntiAliasDownsampleLayer(nn.Module): - def __init__(self, channels: int = 0, filt_size: int = 3, stride: int = 2, no_jit: bool = False): - super(AntiAliasDownsampleLayer, self).__init__() - if no_jit: - self.op = Downsample(channels, filt_size, stride) - else: - self.op = DownsampleJIT(channels, filt_size, stride) - - # FIXME I should probably override _apply and clear DownsampleJIT filter cache for .cuda(), .half(), etc calls - - def forward(self, x): - return self.op(x) - - -@torch.jit.script -class DownsampleJIT(object): - def __init__(self, channels: int = 0, filt_size: int = 3, stride: int = 2): - self.channels = channels - self.stride = stride - self.filt_size = filt_size - assert self.filt_size == 3 - assert stride == 2 - self.filt = {} # lazy init by device for DataParallel compat - - def _create_filter(self, like: torch.Tensor): - filt = torch.tensor([1., 2., 1.], dtype=like.dtype, device=like.device) - filt = filt[:, None] * filt[None, :] - filt = filt / torch.sum(filt) - return filt[None, None, :, :].repeat((self.channels, 1, 1, 1)) - - def __call__(self, input: torch.Tensor): - input_pad = F.pad(input, (1, 1, 1, 1), 'reflect') - filt = self.filt.get(str(input.device), self._create_filter(input)) - return F.conv2d(input_pad, filt, stride=2, padding=0, groups=input.shape[1]) - - -class Downsample(nn.Module): - def __init__(self, channels=None, filt_size=3, stride=2): - super(Downsample, self).__init__() - self.channels = channels - self.filt_size = filt_size - self.stride = stride - - assert self.filt_size == 3 - filt = torch.tensor([1., 2., 1.]) - filt = filt[:, None] * filt[None, :] - filt = filt / torch.sum(filt) - - # self.filt = filt[None, None, :, :].repeat((self.channels, 1, 1, 1)) - self.register_buffer('filt', filt[None, None, :, :].repeat((self.channels, 1, 1, 1))) - - def forward(self, input): - input_pad = F.pad(input, (1, 1, 1, 1), 'reflect') - return F.conv2d(input_pad, self.filt, stride=self.stride, padding=0, groups=input.shape[1]) diff --git a/AVLFormer/src/timm/models/layers/blur_pool.py 
b/AVLFormer/src/timm/models/layers/blur_pool.py deleted file mode 100644 index 399cbe3..0000000 --- a/AVLFormer/src/timm/models/layers/blur_pool.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -BlurPool layer inspired by - - Kornia's Max_BlurPool2d - - Making Convolutional Networks Shift-Invariant Again :cite:`zhang2019shiftinvar` - -FIXME merge this impl with those in `anti_aliasing.py` - -Hacked together by Chris Ha and Ross Wightman -""" - -import torch -import torch.nn as nn -import torch.nn.functional as F -import numpy as np -from typing import Dict -from .padding import get_padding - - -class BlurPool2d(nn.Module): - r"""Creates a module that blurs and downsamples a given feature map. - See :cite:`zhang2019shiftinvar` for more details. - Corresponds to the Downsample class, which does blurring and subsampling - - Args: - channels (int): number of input channels - filt_size (int): binomial filter size for blurring. Currently supports 3 (default) and 5. - stride (int): downsampling filter stride - - Returns: - torch.Tensor: the transformed tensor. - """ - filt: Dict[str, torch.Tensor] - - def __init__(self, channels, filt_size=3, stride=2) -> None: - super(BlurPool2d, self).__init__() - assert filt_size > 1 - self.channels = channels - self.filt_size = filt_size - self.stride = stride - pad_size = [get_padding(filt_size, stride, dilation=1)] * 4 - self.padding = nn.ReflectionPad2d(pad_size) - self._coeffs = torch.tensor((np.poly1d((0.5, 0.5)) ** (self.filt_size - 1)).coeffs) # for torchscript compat - self.filt = {} # lazy init by device for DataParallel compat - - def _create_filter(self, like: torch.Tensor): - blur_filter = (self._coeffs[:, None] * self._coeffs[None, :]).to(dtype=like.dtype, device=like.device) - return blur_filter[None, None, :, :].repeat(self.channels, 1, 1, 1) - - def _apply(self, fn): - # override nn.Module _apply, reset filter cache if used - self.filt = {} - super(BlurPool2d, self)._apply(fn) - - def forward(self, input_tensor: torch.Tensor) -> torch.Tensor: - C = input_tensor.shape[1] - blur_filt = self.filt.get(str(input_tensor.device), self._create_filter(input_tensor)) - return F.conv2d( - self.padding(input_tensor), blur_filt, stride=self.stride, groups=C) diff --git a/AVLFormer/src/timm/models/layers/bottleneck_attn.py b/AVLFormer/src/timm/models/layers/bottleneck_attn.py deleted file mode 100644 index 0bb0e27..0000000 --- a/AVLFormer/src/timm/models/layers/bottleneck_attn.py +++ /dev/null @@ -1,120 +0,0 @@ -""" Bottleneck Self Attention (Bottleneck Transformers) - -Paper: `Bottleneck Transformers for Visual Recognition` - https://arxiv.org/abs/2101.11605 - -@misc{2101.11605, -Author = {Aravind Srinivas and Tsung-Yi Lin and Niki Parmar and Jonathon Shlens and Pieter Abbeel and Ashish Vaswani}, -Title = {Bottleneck Transformers for Visual Recognition}, -Year = {2021}, -} - -Based on ref gist at: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2 - -This impl is a WIP, but given that it is based on the ref gist it is likely not too far off.
- -Hacked together by / Copyright 2021 Ross Wightman -""" -from typing import List - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from .helpers import to_2tuple - - -def rel_logits_1d(q, rel_k, permute_mask: List[int]): - """ Compute relative logits along one dimension - - As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2 - Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925 - - Args: - q: (batch, heads, height, width, dim) - rel_k: (2 * width - 1, dim) - permute_mask: permute output dim according to this - """ - B, H, W, dim = q.shape - x = (q @ rel_k.transpose(-1, -2)) - x = x.reshape(-1, W, 2 * W -1) - - # pad to shift from relative to absolute indexing - x_pad = F.pad(x, [0, 1]).flatten(1) - x_pad = F.pad(x_pad, [0, W - 1]) - - # reshape and slice out the padded elements - x_pad = x_pad.reshape(-1, W + 1, 2 * W - 1) - x = x_pad[:, :W, W - 1:] - - # reshape and tile - x = x.reshape(B, H, 1, W, W).expand(-1, -1, H, -1, -1) - return x.permute(permute_mask) - - -class PosEmbedRel(nn.Module): - """ Relative Position Embedding - As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2 - Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925 - """ - def __init__(self, feat_size, dim_head, scale): - super().__init__() - self.height, self.width = to_2tuple(feat_size) - self.dim_head = dim_head - self.scale = scale - self.height_rel = nn.Parameter(torch.randn(self.height * 2 - 1, dim_head) * self.scale) - self.width_rel = nn.Parameter(torch.randn(self.width * 2 - 1, dim_head) * self.scale) - - def forward(self, q): - B, num_heads, HW, _ = q.shape - - # relative logits in width dimension. - q = q.reshape(B * num_heads, self.height, self.width, -1) - rel_logits_w = rel_logits_1d(q, self.width_rel, permute_mask=(0, 1, 3, 2, 4)) - - # relative logits in height dimension. 
- q = q.transpose(1, 2) - rel_logits_h = rel_logits_1d(q, self.height_rel, permute_mask=(0, 3, 1, 4, 2)) - - rel_logits = rel_logits_h + rel_logits_w - rel_logits = rel_logits.reshape(B, num_heads, HW, HW) - return rel_logits - - -class BottleneckAttn(nn.Module): - """ Bottleneck Attention - Paper: `Bottleneck Transformers for Visual Recognition` - https://arxiv.org/abs/2101.11605 - """ - def __init__(self, dim, dim_out=None, feat_size=None, stride=1, num_heads=4, qkv_bias=False): - super().__init__() - assert feat_size is not None, 'A concrete feature size matching expected input (H, W) is required' - dim_out = dim_out or dim - assert dim_out % num_heads == 0 - self.num_heads = num_heads - self.dim_out = dim_out - self.dim_head = dim_out // num_heads - self.scale = self.dim_head ** -0.5 - - self.qkv = nn.Conv2d(dim, self.dim_out * 3, 1, bias=qkv_bias) - - # NOTE I'm only supporting relative pos embedding for now - self.pos_embed = PosEmbedRel(feat_size, dim_head=self.dim_head, scale=self.scale) - - self.pool = nn.AvgPool2d(2, 2) if stride == 2 else nn.Identity() - - def forward(self, x): - B, C, H, W = x.shape - assert H == self.pos_embed.height and W == self.pos_embed.width - - x = self.qkv(x) # B, 3 * num_heads * dim_head, H, W - x = x.reshape(B, -1, self.dim_head, H * W).transpose(-1, -2) - q, k, v = torch.split(x, self.num_heads, dim=1) - - attn_logits = (q @ k.transpose(-1, -2)) * self.scale - attn_logits = attn_logits + self.pos_embed(q) # B, num_heads, H * W, H * W - - attn_out = attn_logits.softmax(dim = -1) - attn_out = (attn_out @ v).transpose(1, 2).reshape(B, self.dim_out, H, W) # B, dim_out, H, W - attn_out = self.pool(attn_out) - return attn_out - - diff --git a/AVLFormer/src/timm/models/layers/cbam.py b/AVLFormer/src/timm/models/layers/cbam.py deleted file mode 100644 index 44e2fe6..0000000 --- a/AVLFormer/src/timm/models/layers/cbam.py +++ /dev/null @@ -1,99 +0,0 @@ -""" CBAM (sort-of) Attention - -Experimental impl of CBAM: Convolutional Block Attention Module: https://arxiv.org/abs/1807.06521 - -WARNING: Results with these attention layers have been mixed. They can significantly reduce performance on -some tasks, especially fine-grained it seems. I may end up removing this impl. - -Hacked together by / Copyright 2020 Ross Wightman -""" - -import torch -from torch import nn as nn -import torch.nn.functional as F -from .conv_bn_act import ConvBnAct - - -class ChannelAttn(nn.Module): - """ Original CBAM channel attention module, currently avg + max pool variant only. 
- """ - def __init__(self, channels, reduction=16, act_layer=nn.ReLU): - super(ChannelAttn, self).__init__() - self.fc1 = nn.Conv2d(channels, channels // reduction, 1, bias=False) - self.act = act_layer(inplace=True) - self.fc2 = nn.Conv2d(channels // reduction, channels, 1, bias=False) - - def forward(self, x): - x_avg = x.mean((2, 3), keepdim=True) - x_max = F.adaptive_max_pool2d(x, 1) - x_avg = self.fc2(self.act(self.fc1(x_avg))) - x_max = self.fc2(self.act(self.fc1(x_max))) - x_attn = x_avg + x_max - return x * x_attn.sigmoid() - - -class LightChannelAttn(ChannelAttn): - """An experimental 'lightweight' that sums avg + max pool first - """ - def __init__(self, channels, reduction=16): - super(LightChannelAttn, self).__init__(channels, reduction) - - def forward(self, x): - x_pool = 0.5 * x.mean((2, 3), keepdim=True) + 0.5 * F.adaptive_max_pool2d(x, 1) - x_attn = self.fc2(self.act(self.fc1(x_pool))) - return x * x_attn.sigmoid() - - -class SpatialAttn(nn.Module): - """ Original CBAM spatial attention module - """ - def __init__(self, kernel_size=7): - super(SpatialAttn, self).__init__() - self.conv = ConvBnAct(2, 1, kernel_size, act_layer=None) - - def forward(self, x): - x_avg = torch.mean(x, dim=1, keepdim=True) - x_max = torch.max(x, dim=1, keepdim=True)[0] - x_attn = torch.cat([x_avg, x_max], dim=1) - x_attn = self.conv(x_attn) - return x * x_attn.sigmoid() - - -class LightSpatialAttn(nn.Module): - """An experimental 'lightweight' variant that sums avg_pool and max_pool results. - """ - def __init__(self, kernel_size=7): - super(LightSpatialAttn, self).__init__() - self.conv = ConvBnAct(1, 1, kernel_size, act_layer=None) - - def forward(self, x): - x_avg = torch.mean(x, dim=1, keepdim=True) - x_max = torch.max(x, dim=1, keepdim=True)[0] - x_attn = 0.5 * x_avg + 0.5 * x_max - x_attn = self.conv(x_attn) - return x * x_attn.sigmoid() - - -class CbamModule(nn.Module): - def __init__(self, channels, spatial_kernel_size=7): - super(CbamModule, self).__init__() - self.channel = ChannelAttn(channels) - self.spatial = SpatialAttn(spatial_kernel_size) - - def forward(self, x): - x = self.channel(x) - x = self.spatial(x) - return x - - -class LightCbamModule(nn.Module): - def __init__(self, channels, spatial_kernel_size=7): - super(LightCbamModule, self).__init__() - self.channel = LightChannelAttn(channels) - self.spatial = LightSpatialAttn(spatial_kernel_size) - - def forward(self, x): - x = self.channel(x) - x = self.spatial(x) - return x - diff --git a/AVLFormer/src/timm/models/layers/classifier.py b/AVLFormer/src/timm/models/layers/classifier.py deleted file mode 100644 index 516cc6c..0000000 --- a/AVLFormer/src/timm/models/layers/classifier.py +++ /dev/null @@ -1,55 +0,0 @@ -""" Classifier head and layer factory - -Hacked together by / Copyright 2020 Ross Wightman -""" -from torch import nn as nn -from torch.nn import functional as F - -from .adaptive_avgmax_pool import SelectAdaptivePool2d -from .linear import Linear - - -def _create_pool(num_features, num_classes, pool_type='avg', use_conv=False): - flatten_in_pool = not use_conv # flatten when we use a Linear layer after pooling - if not pool_type: - assert num_classes == 0 or use_conv,\ - 'Pooling can only be disabled if classifier is also removed or conv classifier is used' - flatten_in_pool = False # disable flattening if pooling is pass-through (no pooling) - global_pool = SelectAdaptivePool2d(pool_type=pool_type, flatten=flatten_in_pool) - num_pooled_features = num_features * global_pool.feat_mult() - return global_pool, 
num_pooled_features - - -def _create_fc(num_features, num_classes, pool_type='avg', use_conv=False): - if num_classes <= 0: - fc = nn.Identity() # pass-through (no classifier) - elif use_conv: - fc = nn.Conv2d(num_features, num_classes, 1, bias=True) - else: - # NOTE: using my Linear wrapper that fixes AMP + torchscript casting issue - fc = Linear(num_features, num_classes, bias=True) - return fc - - -def create_classifier(num_features, num_classes, pool_type='avg', use_conv=False): - global_pool, num_pooled_features = _create_pool(num_features, num_classes, pool_type, use_conv=use_conv) - fc = _create_fc(num_pooled_features, num_classes, use_conv=use_conv) - return global_pool, fc - - -class ClassifierHead(nn.Module): - """Classifier head w/ configurable global pooling and dropout.""" - - def __init__(self, in_chs, num_classes, pool_type='avg', drop_rate=0., use_conv=False): - super(ClassifierHead, self).__init__() - self.drop_rate = drop_rate - self.global_pool, num_pooled_features = _create_pool(in_chs, num_classes, pool_type, use_conv=use_conv) - self.fc = _create_fc(num_pooled_features, num_classes, use_conv=use_conv) - self.flatten_after_fc = use_conv and pool_type - - def forward(self, x): - x = self.global_pool(x) - if self.drop_rate: - x = F.dropout(x, p=float(self.drop_rate), training=self.training) - x = self.fc(x) - return x diff --git a/AVLFormer/src/timm/models/layers/cond_conv2d.py b/AVLFormer/src/timm/models/layers/cond_conv2d.py deleted file mode 100644 index 8b4bbca..0000000 --- a/AVLFormer/src/timm/models/layers/cond_conv2d.py +++ /dev/null @@ -1,122 +0,0 @@ -""" PyTorch Conditionally Parameterized Convolution (CondConv) - -Paper: CondConv: Conditionally Parameterized Convolutions for Efficient Inference -(https://arxiv.org/abs/1904.04971) - -Hacked together by / Copyright 2020 Ross Wightman -""" - -import math -from functools import partial -import numpy as np -import torch -from torch import nn as nn -from torch.nn import functional as F - -from .helpers import to_2tuple -from .conv2d_same import conv2d_same -from .padding import get_padding_value - - -def get_condconv_initializer(initializer, num_experts, expert_shape): - def condconv_initializer(weight): - """CondConv initializer function.""" - num_params = np.prod(expert_shape) - if (len(weight.shape) != 2 or weight.shape[0] != num_experts or - weight.shape[1] != num_params): - raise (ValueError( - 'CondConv variables must have shape [num_experts, num_params]')) - for i in range(num_experts): - initializer(weight[i].view(expert_shape)) - return condconv_initializer - - -class CondConv2d(nn.Module): - """ Conditionally Parameterized Convolution - Inspired by: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/condconv/condconv_layers.py - - Grouped convolution hackery for parallel execution of the per-sample kernel filters inspired by this discussion: - https://github.com/pytorch/pytorch/issues/17983 - """ - __constants__ = ['in_channels', 'out_channels', 'dynamic_padding'] - - def __init__(self, in_channels, out_channels, kernel_size=3, - stride=1, padding='', dilation=1, groups=1, bias=False, num_experts=4): - super(CondConv2d, self).__init__() - - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = to_2tuple(kernel_size) - self.stride = to_2tuple(stride) - padding_val, is_padding_dynamic = get_padding_value( - padding, kernel_size, stride=stride, dilation=dilation) - self.dynamic_padding = is_padding_dynamic # if in forward to work with torchscript - 
self.padding = to_2tuple(padding_val) - self.dilation = to_2tuple(dilation) - self.groups = groups - self.num_experts = num_experts - - self.weight_shape = (self.out_channels, self.in_channels // self.groups) + self.kernel_size - weight_num_param = 1 - for wd in self.weight_shape: - weight_num_param *= wd - self.weight = torch.nn.Parameter(torch.Tensor(self.num_experts, weight_num_param)) - - if bias: - self.bias_shape = (self.out_channels,) - self.bias = torch.nn.Parameter(torch.Tensor(self.num_experts, self.out_channels)) - else: - self.register_parameter('bias', None) - - self.reset_parameters() - - def reset_parameters(self): - init_weight = get_condconv_initializer( - partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.num_experts, self.weight_shape) - init_weight(self.weight) - if self.bias is not None: - fan_in = np.prod(self.weight_shape[1:]) - bound = 1 / math.sqrt(fan_in) - init_bias = get_condconv_initializer( - partial(nn.init.uniform_, a=-bound, b=bound), self.num_experts, self.bias_shape) - init_bias(self.bias) - - def forward(self, x, routing_weights): - B, C, H, W = x.shape - weight = torch.matmul(routing_weights, self.weight) - new_weight_shape = (B * self.out_channels, self.in_channels // self.groups) + self.kernel_size - weight = weight.view(new_weight_shape) - bias = None - if self.bias is not None: - bias = torch.matmul(routing_weights, self.bias) - bias = bias.view(B * self.out_channels) - # move batch elements with channels so each batch element can be efficiently convolved with separate kernel - x = x.view(1, B * C, H, W) - if self.dynamic_padding: - out = conv2d_same( - x, weight, bias, stride=self.stride, padding=self.padding, - dilation=self.dilation, groups=self.groups * B) - else: - out = F.conv2d( - x, weight, bias, stride=self.stride, padding=self.padding, - dilation=self.dilation, groups=self.groups * B) - out = out.permute([1, 0, 2, 3]).view(B, self.out_channels, out.shape[-2], out.shape[-1]) - - # Literal port (from TF definition) - # x = torch.split(x, 1, 0) - # weight = torch.split(weight, 1, 0) - # if self.bias is not None: - # bias = torch.matmul(routing_weights, self.bias) - # bias = torch.split(bias, 1, 0) - # else: - # bias = [None] * B - # out = [] - # for xi, wi, bi in zip(x, weight, bias): - # wi = wi.view(*self.weight_shape) - # if bi is not None: - # bi = bi.view(*self.bias_shape) - # out.append(self.conv_fn( - # xi, wi, bi, stride=self.stride, padding=self.padding, - # dilation=self.dilation, groups=self.groups)) - # out = torch.cat(out, 0) - return out diff --git a/AVLFormer/src/timm/models/layers/config.py b/AVLFormer/src/timm/models/layers/config.py deleted file mode 100644 index f07b9d7..0000000 --- a/AVLFormer/src/timm/models/layers/config.py +++ /dev/null @@ -1,115 +0,0 @@ -""" Model / Layer Config singleton state -""" -from typing import Any, Optional - -__all__ = [ - 'is_exportable', 'is_scriptable', 'is_no_jit', - 'set_exportable', 'set_scriptable', 'set_no_jit', 'set_layer_config' -] - -# Set to True if prefer to have layers with no jit optimization (includes activations) -_NO_JIT = False - -# Set to True if prefer to have activation layers with no jit optimization -# NOTE not currently used as no difference between no_jit and no_activation jit as only layers obeying -# the jit flags so far are activations. This will change as more layers are updated and/or added. 
-_NO_ACTIVATION_JIT = False - -# Set to True if exporting a model with Same padding via ONNX -_EXPORTABLE = False - -# Set to True if wanting to use torch.jit.script on a model -_SCRIPTABLE = False - - -def is_no_jit(): - return _NO_JIT - - -class set_no_jit: - def __init__(self, mode: bool) -> None: - global _NO_JIT - self.prev = _NO_JIT - _NO_JIT = mode - - def __enter__(self) -> None: - pass - - def __exit__(self, *args: Any) -> bool: - global _NO_JIT - _NO_JIT = self.prev - return False - - -def is_exportable(): - return _EXPORTABLE - - -class set_exportable: - def __init__(self, mode: bool) -> None: - global _EXPORTABLE - self.prev = _EXPORTABLE - _EXPORTABLE = mode - - def __enter__(self) -> None: - pass - - def __exit__(self, *args: Any) -> bool: - global _EXPORTABLE - _EXPORTABLE = self.prev - return False - - -def is_scriptable(): - return _SCRIPTABLE - - -class set_scriptable: - def __init__(self, mode: bool) -> None: - global _SCRIPTABLE - self.prev = _SCRIPTABLE - _SCRIPTABLE = mode - - def __enter__(self) -> None: - pass - - def __exit__(self, *args: Any) -> bool: - global _SCRIPTABLE - _SCRIPTABLE = self.prev - return False - - -class set_layer_config: - """ Layer config context manager that allows setting all layer config flags at once. - If a flag arg is None, it will not change the current value. - """ - def __init__( - self, - scriptable: Optional[bool] = None, - exportable: Optional[bool] = None, - no_jit: Optional[bool] = None, - no_activation_jit: Optional[bool] = None): - global _SCRIPTABLE - global _EXPORTABLE - global _NO_JIT - global _NO_ACTIVATION_JIT - self.prev = _SCRIPTABLE, _EXPORTABLE, _NO_JIT, _NO_ACTIVATION_JIT - if scriptable is not None: - _SCRIPTABLE = scriptable - if exportable is not None: - _EXPORTABLE = exportable - if no_jit is not None: - _NO_JIT = no_jit - if no_activation_jit is not None: - _NO_ACTIVATION_JIT = no_activation_jit - - def __enter__(self) -> None: - pass - - def __exit__(self, *args: Any) -> bool: - global _SCRIPTABLE - global _EXPORTABLE - global _NO_JIT - global _NO_ACTIVATION_JIT - _SCRIPTABLE, _EXPORTABLE, _NO_JIT, _NO_ACTIVATION_JIT = self.prev - return False diff --git a/AVLFormer/src/timm/models/layers/conv2d_same.py b/AVLFormer/src/timm/models/layers/conv2d_same.py deleted file mode 100644 index 75f0f98..0000000 --- a/AVLFormer/src/timm/models/layers/conv2d_same.py +++ /dev/null @@ -1,42 +0,0 @@ -""" Conv2d w/ Same Padding - -Hacked together by / Copyright 2020 Ross Wightman -""" -import torch -import torch.nn as nn -import torch.nn.functional as F -from typing import Tuple, Optional - -from .padding import pad_same, get_padding_value - - -def conv2d_same( - x, weight: torch.Tensor, bias: Optional[torch.Tensor] = None, stride: Tuple[int, int] = (1, 1), - padding: Tuple[int, int] = (0, 0), dilation: Tuple[int, int] = (1, 1), groups: int = 1): - x = pad_same(x, weight.shape[-2:], stride, dilation) - return F.conv2d(x, weight, bias, stride, (0, 0), dilation, groups) - - -class Conv2dSame(nn.Conv2d): - """ Tensorflow like 'SAME' convolution wrapper for 2D convolutions - """ - - def __init__(self, in_channels, out_channels, kernel_size, stride=1, - padding=0, dilation=1, groups=1, bias=True): - super(Conv2dSame, self).__init__( - in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias) - - def forward(self, x): - return conv2d_same(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) - - -def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs): - padding = kwargs.pop('padding', 
'') - kwargs.setdefault('bias', False) - padding, is_dynamic = get_padding_value(padding, kernel_size, **kwargs) - if is_dynamic: - return Conv2dSame(in_chs, out_chs, kernel_size, **kwargs) - else: - return nn.Conv2d(in_chs, out_chs, kernel_size, padding=padding, **kwargs) - - diff --git a/AVLFormer/src/timm/models/layers/conv_bn_act.py b/AVLFormer/src/timm/models/layers/conv_bn_act.py deleted file mode 100644 index 33005c3..0000000 --- a/AVLFormer/src/timm/models/layers/conv_bn_act.py +++ /dev/null @@ -1,40 +0,0 @@ -""" Conv2d + BN + Act - -Hacked together by / Copyright 2020 Ross Wightman -""" -from torch import nn as nn - -from .create_conv2d import create_conv2d -from .create_norm_act import convert_norm_act - - -class ConvBnAct(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding='', dilation=1, groups=1, - bias=False, apply_act=True, norm_layer=nn.BatchNorm2d, act_layer=nn.ReLU, aa_layer=None, - drop_block=None): - super(ConvBnAct, self).__init__() - use_aa = aa_layer is not None - - self.conv = create_conv2d( - in_channels, out_channels, kernel_size, stride=1 if use_aa else stride, - padding=padding, dilation=dilation, groups=groups, bias=bias) - - # NOTE for backwards compatibility with models that use separate norm and act layer definitions - norm_act_layer = convert_norm_act(norm_layer, act_layer) - self.bn = norm_act_layer(out_channels, apply_act=apply_act, drop_block=drop_block) - self.aa = aa_layer(channels=out_channels) if stride == 2 and use_aa else None - - @property - def in_channels(self): - return self.conv.in_channels - - @property - def out_channels(self): - return self.conv.out_channels - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - if self.aa is not None: - x = self.aa(x) - return x diff --git a/AVLFormer/src/timm/models/layers/create_act.py b/AVLFormer/src/timm/models/layers/create_act.py deleted file mode 100644 index 426c368..0000000 --- a/AVLFormer/src/timm/models/layers/create_act.py +++ /dev/null @@ -1,133 +0,0 @@ -""" Activation Factory -Hacked together by / Copyright 2020 Ross Wightman -""" -from .activations import * -from .activations_jit import * -from .activations_me import * -from .config import is_exportable, is_scriptable, is_no_jit - -# PyTorch has an optimized, native 'silu' (aka 'swish') operator as of PyTorch 1.7. This code -# will use native version if present. Eventually, the custom Swish layers will be removed -# and only native 'silu' will be used. 
-_has_silu = 'silu' in dir(torch.nn.functional) - -_ACT_FN_DEFAULT = dict( - silu=F.silu if _has_silu else swish, - swish=F.silu if _has_silu else swish, - mish=mish, - relu=F.relu, - relu6=F.relu6, - leaky_relu=F.leaky_relu, - elu=F.elu, - celu=F.celu, - selu=F.selu, - gelu=gelu, - sigmoid=sigmoid, - tanh=tanh, - hard_sigmoid=hard_sigmoid, - hard_swish=hard_swish, - hard_mish=hard_mish, -) - -_ACT_FN_JIT = dict( - silu=F.silu if _has_silu else swish_jit, - swish=F.silu if _has_silu else swish_jit, - mish=mish_jit, - hard_sigmoid=hard_sigmoid_jit, - hard_swish=hard_swish_jit, - hard_mish=hard_mish_jit -) - -_ACT_FN_ME = dict( - silu=F.silu if _has_silu else swish_me, - swish=F.silu if _has_silu else swish_me, - mish=mish_me, - hard_sigmoid=hard_sigmoid_me, - hard_swish=hard_swish_me, - hard_mish=hard_mish_me, -) - -_ACT_LAYER_DEFAULT = dict( - silu=nn.SiLU if _has_silu else Swish, - swish=nn.SiLU if _has_silu else Swish, - mish=Mish, - relu=nn.ReLU, - relu6=nn.ReLU6, - leaky_relu=nn.LeakyReLU, - elu=nn.ELU, - prelu=PReLU, - celu=nn.CELU, - selu=nn.SELU, - gelu=GELU, - sigmoid=Sigmoid, - tanh=Tanh, - hard_sigmoid=HardSigmoid, - hard_swish=HardSwish, - hard_mish=HardMish, -) - -_ACT_LAYER_JIT = dict( - silu=nn.SiLU if _has_silu else SwishJit, - swish=nn.SiLU if _has_silu else SwishJit, - mish=MishJit, - hard_sigmoid=HardSigmoidJit, - hard_swish=HardSwishJit, - hard_mish=HardMishJit -) - -_ACT_LAYER_ME = dict( - silu=nn.SiLU if _has_silu else SwishMe, - swish=nn.SiLU if _has_silu else SwishMe, - mish=MishMe, - hard_sigmoid=HardSigmoidMe, - hard_swish=HardSwishMe, - hard_mish=HardMishMe, -) - - -def get_act_fn(name='relu'): - """ Activation Function Factory - Fetching activation fns by name with this function allows export or torch script friendly - functions to be returned dynamically based on current config. - """ - if not name: - return None - if not (is_no_jit() or is_exportable() or is_scriptable()): - # If not exporting or scripting the model, first look for a memory-efficient version with - # custom autograd, then fallback - if name in _ACT_FN_ME: - return _ACT_FN_ME[name] - if is_exportable() and name in ('silu', 'swish'): - # FIXME PyTorch SiLU doesn't ONNX export, this is a temp hack - return swish - if not (is_no_jit() or is_exportable()): - if name in _ACT_FN_JIT: - return _ACT_FN_JIT[name] - return _ACT_FN_DEFAULT[name] - - -def get_act_layer(name='relu'): - """ Activation Layer Factory - Fetching activation layers by name with this function allows export or torch script friendly - functions to be returned dynamically based on current config. 
- """ - if not name: - return None - if not (is_no_jit() or is_exportable() or is_scriptable()): - if name in _ACT_LAYER_ME: - return _ACT_LAYER_ME[name] - if is_exportable() and name in ('silu', 'swish'): - # FIXME PyTorch SiLU doesn't ONNX export, this is a temp hack - return Swish - if not (is_no_jit() or is_exportable()): - if name in _ACT_LAYER_JIT: - return _ACT_LAYER_JIT[name] - return _ACT_LAYER_DEFAULT[name] - - -def create_act_layer(name, inplace=False, **kwargs): - act_layer = get_act_layer(name) - if act_layer is not None: - return act_layer(inplace=inplace, **kwargs) - else: - return None diff --git a/AVLFormer/src/timm/models/layers/create_attn.py b/AVLFormer/src/timm/models/layers/create_attn.py deleted file mode 100644 index ff20e5d..0000000 --- a/AVLFormer/src/timm/models/layers/create_attn.py +++ /dev/null @@ -1,45 +0,0 @@ -""" Select AttentionFactory Method - -Hacked together by / Copyright 2020 Ross Wightman -""" -import torch -from .se import SEModule, EffectiveSEModule -from .eca import EcaModule, CecaModule -from .cbam import CbamModule, LightCbamModule - - -def get_attn(attn_type): - if isinstance(attn_type, torch.nn.Module): - return attn_type - module_cls = None - if attn_type is not None: - if isinstance(attn_type, str): - attn_type = attn_type.lower() - if attn_type == 'se': - module_cls = SEModule - elif attn_type == 'ese': - module_cls = EffectiveSEModule - elif attn_type == 'eca': - module_cls = EcaModule - elif attn_type == 'ceca': - module_cls = CecaModule - elif attn_type == 'cbam': - module_cls = CbamModule - elif attn_type == 'lcbam': - module_cls = LightCbamModule - else: - assert False, "Invalid attn module (%s)" % attn_type - elif isinstance(attn_type, bool): - if attn_type: - module_cls = SEModule - else: - module_cls = attn_type - return module_cls - - -def create_attn(attn_type, channels, **kwargs): - module_cls = get_attn(attn_type) - if module_cls is not None: - # NOTE: it's expected the first (positional) argument of all attention layers is the # input channels - return module_cls(channels, **kwargs) - return None diff --git a/AVLFormer/src/timm/models/layers/create_conv2d.py b/AVLFormer/src/timm/models/layers/create_conv2d.py deleted file mode 100644 index 3a0cc03..0000000 --- a/AVLFormer/src/timm/models/layers/create_conv2d.py +++ /dev/null @@ -1,31 +0,0 @@ -""" Create Conv2d Factory Method - -Hacked together by / Copyright 2020 Ross Wightman -""" - -from .mixed_conv2d import MixedConv2d -from .cond_conv2d import CondConv2d -from .conv2d_same import create_conv2d_pad - - -def create_conv2d(in_channels, out_channels, kernel_size, **kwargs): - """ Select a 2d convolution implementation based on arguments - Creates and returns one of torch.nn.Conv2d, Conv2dSame, MixedConv2d, or CondConv2d. - - Used extensively by EfficientNet, MobileNetv3 and related networks. - """ - if isinstance(kernel_size, list): - assert 'num_experts' not in kwargs # MixNet + CondConv combo not supported currently - assert 'groups' not in kwargs # MixedConv groups are defined by kernel list - # We're going to use only lists for defining the MixedConv2d kernel groups, - # ints, tuples, other iterables will continue to pass to normal conv and specify h, w. 
m = MixedConv2d(in_channels, out_channels, kernel_size, **kwargs) - else: - depthwise = kwargs.pop('depthwise', False) - # for DW out_channels must be multiple of in_channels as must have out_channels % groups == 0 - groups = in_channels if depthwise else kwargs.pop('groups', 1) - if 'num_experts' in kwargs and kwargs['num_experts'] > 0: - m = CondConv2d(in_channels, out_channels, kernel_size, groups=groups, **kwargs) - else: - m = create_conv2d_pad(in_channels, out_channels, kernel_size, groups=groups, **kwargs) - return m diff --git a/AVLFormer/src/timm/models/layers/create_norm_act.py b/AVLFormer/src/timm/models/layers/create_norm_act.py deleted file mode 100644 index 5b56294..0000000 --- a/AVLFormer/src/timm/models/layers/create_norm_act.py +++ /dev/null @@ -1,83 +0,0 @@ -""" NormAct (Normalization + Activation Layer) Factory - -Create norm + act combo modules that attempt to be backwards compatible with separate norm + act -instances in models. Where these are used it will be possible to swap separate BN + act layers with -combined modules like IABN or EvoNorms. - -Hacked together by / Copyright 2020 Ross Wightman -""" -import types -import functools - -import torch -import torch.nn as nn - -from .evo_norm import EvoNormBatch2d, EvoNormSample2d -from .norm_act import BatchNormAct2d, GroupNormAct -from .inplace_abn import InplaceAbn - -_NORM_ACT_TYPES = {BatchNormAct2d, GroupNormAct, EvoNormBatch2d, EvoNormSample2d, InplaceAbn} -_NORM_ACT_REQUIRES_ARG = {BatchNormAct2d, GroupNormAct, InplaceAbn} # requires act_layer arg to define act type - - -def get_norm_act_layer(layer_class): - layer_class = layer_class.replace('_', '').lower() - if layer_class.startswith("batchnorm"): - layer = BatchNormAct2d - elif layer_class.startswith("groupnorm"): - layer = GroupNormAct - elif layer_class == "evonormbatch": - layer = EvoNormBatch2d - elif layer_class == "evonormsample": - layer = EvoNormSample2d - elif layer_class == "iabn" or layer_class == "inplaceabn": - layer = InplaceAbn - else: - assert False, "Invalid norm_act layer (%s)" % layer_class - return layer - - -def create_norm_act(layer_type, num_features, apply_act=True, jit=False, **kwargs): - layer_parts = layer_type.split('-') # e.g. batchnorm-leaky_relu - assert len(layer_parts) in (1, 2) - layer = get_norm_act_layer(layer_parts[0]) - #activation_class = layer_parts[1].lower() if len(layer_parts) > 1 else '' # FIXME support string act selection?
- layer_instance = layer(num_features, apply_act=apply_act, **kwargs) - if jit: - layer_instance = torch.jit.script(layer_instance) - return layer_instance - - -def convert_norm_act(norm_layer, act_layer): - assert isinstance(norm_layer, (type, str, types.FunctionType, functools.partial)) - assert act_layer is None or isinstance(act_layer, (type, str, types.FunctionType, functools.partial)) - norm_act_kwargs = {} - - # unbind partial fn, so args can be rebound later - if isinstance(norm_layer, functools.partial): - norm_act_kwargs.update(norm_layer.keywords) - norm_layer = norm_layer.func - - if isinstance(norm_layer, str): - norm_act_layer = get_norm_act_layer(norm_layer) - elif norm_layer in _NORM_ACT_TYPES: - norm_act_layer = norm_layer - elif isinstance(norm_layer, types.FunctionType): - # if function type, must be a lambda/fn that creates a norm_act layer - norm_act_layer = norm_layer - else: - type_name = norm_layer.__name__.lower() - if type_name.startswith('batchnorm'): - norm_act_layer = BatchNormAct2d - elif type_name.startswith('groupnorm'): - norm_act_layer = GroupNormAct - else: - assert False, f"No equivalent norm_act layer for {type_name}" - - if norm_act_layer in _NORM_ACT_REQUIRES_ARG: - # pass `act_layer` through for backwards compat where `act_layer=None` implies no activation. - # In the future, may force use of `apply_act` with `act_layer` arg bound to relevant NormAct types - norm_act_kwargs.setdefault('act_layer', act_layer) - if norm_act_kwargs: - norm_act_layer = functools.partial(norm_act_layer, **norm_act_kwargs) # bind/rebind args - return norm_act_layer diff --git a/AVLFormer/src/timm/models/layers/create_self_attn.py b/AVLFormer/src/timm/models/layers/create_self_attn.py deleted file mode 100644 index 8c0984c..0000000 --- a/AVLFormer/src/timm/models/layers/create_self_attn.py +++ /dev/null @@ -1,17 +0,0 @@ -from .bottleneck_attn import BottleneckAttn -from .halo_attn import HaloAttn -from .lambda_layer import LambdaLayer - - -def get_self_attn(attn_type): - if attn_type == 'bottleneck': - return BottleneckAttn - elif attn_type == 'halo': - return HaloAttn - elif attn_type == 'lambda': - return LambdaLayer - - -def create_self_attn(attn_type, dim, stride=1, **kwargs): - attn_fn = get_self_attn(attn_type) - return attn_fn(dim, stride=stride, **kwargs) diff --git a/AVLFormer/src/timm/models/layers/drop.py b/AVLFormer/src/timm/models/layers/drop.py deleted file mode 100644 index 6de9e3f..0000000 --- a/AVLFormer/src/timm/models/layers/drop.py +++ /dev/null @@ -1,168 +0,0 @@ -""" DropBlock, DropPath - -PyTorch implementations of DropBlock and DropPath (Stochastic Depth) regularization layers. - -Papers: -DropBlock: A regularization method for convolutional networks (https://arxiv.org/abs/1810.12890) - -Deep Networks with Stochastic Depth (https://arxiv.org/abs/1603.09382) - -Code: -DropBlock impl inspired by two Tensorflow impl that I liked: - - https://github.com/tensorflow/tpu/blob/master/models/official/resnet/resnet_model.py#L74 - - https://github.com/clovaai/assembled-cnn/blob/master/nets/blocks.py - -Hacked together by / Copyright 2020 Ross Wightman -""" -import torch -import torch.nn as nn -import torch.nn.functional as F - - -def drop_block_2d( - x, drop_prob: float = 0.1, block_size: int = 7, gamma_scale: float = 1.0, - with_noise: bool = False, inplace: bool = False, batchwise: bool = False): - """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf - - DropBlock with an experimental gaussian noise option. 
This layer has been tested on a few training - runs with success, but needs further validation and possibly optimization for lower runtime impact. - """ - B, C, H, W = x.shape - total_size = W * H - clipped_block_size = min(block_size, min(W, H)) - # seed_drop_rate, the gamma parameter - gamma = gamma_scale * drop_prob * total_size / clipped_block_size ** 2 / ( - (W - block_size + 1) * (H - block_size + 1)) - - # Forces the block to be inside the feature map. - w_i, h_i = torch.meshgrid(torch.arange(W).to(x.device), torch.arange(H).to(x.device)) - valid_block = ((w_i >= clipped_block_size // 2) & (w_i < W - (clipped_block_size - 1) // 2)) & \ - ((h_i >= clipped_block_size // 2) & (h_i < H - (clipped_block_size - 1) // 2)) - valid_block = torch.reshape(valid_block, (1, 1, H, W)).to(dtype=x.dtype) - - if batchwise: - # one mask for whole batch, quite a bit faster - uniform_noise = torch.rand((1, C, H, W), dtype=x.dtype, device=x.device) - else: - uniform_noise = torch.rand_like(x) - block_mask = ((2 - gamma - valid_block + uniform_noise) >= 1).to(dtype=x.dtype) - block_mask = -F.max_pool2d( - -block_mask, - kernel_size=clipped_block_size, # block_size, - stride=1, - padding=clipped_block_size // 2) - - if with_noise: - normal_noise = torch.randn((1, C, H, W), dtype=x.dtype, device=x.device) if batchwise else torch.randn_like(x) - if inplace: - x.mul_(block_mask).add_(normal_noise * (1 - block_mask)) - else: - x = x * block_mask + normal_noise * (1 - block_mask) - else: - normalize_scale = (block_mask.numel() / block_mask.to(dtype=torch.float32).sum().add(1e-7)).to(x.dtype) - if inplace: - x.mul_(block_mask * normalize_scale) - else: - x = x * block_mask * normalize_scale - return x - - -def drop_block_fast_2d( - x: torch.Tensor, drop_prob: float = 0.1, block_size: int = 7, - gamma_scale: float = 1.0, with_noise: bool = False, inplace: bool = False, batchwise: bool = False): - """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf - - DropBlock with an experimental gaussian noise option. Simplied from above without concern for valid - block mask at edges. - """ - B, C, H, W = x.shape - total_size = W * H - clipped_block_size = min(block_size, min(W, H)) - gamma = gamma_scale * drop_prob * total_size / clipped_block_size ** 2 / ( - (W - block_size + 1) * (H - block_size + 1)) - - if batchwise: - # one mask for whole batch, quite a bit faster - block_mask = torch.rand((1, C, H, W), dtype=x.dtype, device=x.device) < gamma - else: - # mask per batch element - block_mask = torch.rand_like(x) < gamma - block_mask = F.max_pool2d( - block_mask.to(x.dtype), kernel_size=clipped_block_size, stride=1, padding=clipped_block_size // 2) - - if with_noise: - normal_noise = torch.randn((1, C, H, W), dtype=x.dtype, device=x.device) if batchwise else torch.randn_like(x) - if inplace: - x.mul_(1. - block_mask).add_(normal_noise * block_mask) - else: - x = x * (1. - block_mask) + normal_noise * block_mask - else: - block_mask = 1 - block_mask - normalize_scale = (block_mask.numel() / block_mask.to(dtype=torch.float32).sum().add(1e-7)).to(dtype=x.dtype) - if inplace: - x.mul_(block_mask * normalize_scale) - else: - x = x * block_mask * normalize_scale - return x - - -class DropBlock2d(nn.Module): - """ DropBlock. 
See https://arxiv.org/pdf/1810.12890.pdf - """ - def __init__(self, - drop_prob=0.1, - block_size=7, - gamma_scale=1.0, - with_noise=False, - inplace=False, - batchwise=False, - fast=True): - super(DropBlock2d, self).__init__() - self.drop_prob = drop_prob - self.gamma_scale = gamma_scale - self.block_size = block_size - self.with_noise = with_noise - self.inplace = inplace - self.batchwise = batchwise - self.fast = fast # FIXME finish comparisons of fast vs not - - def forward(self, x): - if not self.training or not self.drop_prob: - return x - if self.fast: - return drop_block_fast_2d( - x, self.drop_prob, self.block_size, self.gamma_scale, self.with_noise, self.inplace, self.batchwise) - else: - return drop_block_2d( - x, self.drop_prob, self.block_size, self.gamma_scale, self.with_noise, self.inplace, self.batchwise) - - -def drop_path(x, drop_prob: float = 0., training: bool = False): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - - This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, - the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... - See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for - changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use - 'survival rate' as the argument. - - """ - if drop_prob == 0. or not training: - return x - keep_prob = 1 - drop_prob - shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets - random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) - random_tensor.floor_() # binarize - output = x.div(keep_prob) * random_tensor - return output - - -class DropPath(nn.Module): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - """ - def __init__(self, drop_prob=None): - super(DropPath, self).__init__() - self.drop_prob = drop_prob - - def forward(self, x): - return drop_path(x, self.drop_prob, self.training) diff --git a/AVLFormer/src/timm/models/layers/eca.py b/AVLFormer/src/timm/models/layers/eca.py deleted file mode 100644 index 3a7f8b8..0000000 --- a/AVLFormer/src/timm/models/layers/eca.py +++ /dev/null @@ -1,107 +0,0 @@ -""" -ECA module from ECAnet - -paper: ECA-Net: Efficient Channel Attention for Deep Convolutional Neural Networks -https://arxiv.org/abs/1910.03151 - -Original ECA model borrowed from https://github.com/BangguWu/ECANet - -Modified circular ECA implementation and adaption for use in timm package -by Chris Ha https://github.com/VRandme - -Original License: - -MIT License - -Copyright (c) 2019 BangguWu, Qilong Wang - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -import math -from torch import nn -import torch.nn.functional as F - - -class EcaModule(nn.Module): - """Constructs an ECA module. - - Args: - channels: Number of channels of the input feature map for use in adaptive kernel sizes - for actual calculations according to channel. - gamma, beta: when channel is given parameters of mapping function - refer to original paper https://arxiv.org/pdf/1910.03151.pdf - (default=None. if channel size not given, use k_size given for kernel size.) - kernel_size: Adaptive selection of kernel size (default=3) - """ - def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1): - super(EcaModule, self).__init__() - assert kernel_size % 2 == 1 - if channels is not None: - t = int(abs(math.log(channels, 2) + beta) / gamma) - kernel_size = max(t if t % 2 else t + 1, 3) - - self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=(kernel_size - 1) // 2, bias=False) - - def forward(self, x): - y = x.mean((2, 3)).view(x.shape[0], 1, -1) # view for 1d conv - y = self.conv(y) - y = y.view(x.shape[0], -1, 1, 1).sigmoid() - return x * y.expand_as(x) - - -class CecaModule(nn.Module): - """Constructs a circular ECA module. - - ECA module where the conv uses circular padding rather than zero padding. - Unlike the spatial dimension, the channels do not have inherent ordering nor - locality. Although this module in essence, applies such an assumption, it is unnecessary - to limit the channels on either "edge" from being circularly adapted to each other. - This will fundamentally increase connectivity and possibly increase performance metrics - (accuracy, robustness), without significantly impacting resource metrics - (parameter size, throughput,latency, etc) - - Args: - channels: Number of channels of the input feature map for use in adaptive kernel sizes - for actual calculations according to channel. - gamma, beta: when channel is given parameters of mapping function - refer to original paper https://arxiv.org/pdf/1910.03151.pdf - (default=None. if channel size not given, use k_size given for kernel size.) 
- kernel_size: Adaptive selection of kernel size (default=3) - """ - - def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1): - super(CecaModule, self).__init__() - assert kernel_size % 2 == 1 - if channels is not None: - t = int(abs(math.log(channels, 2) + beta) / gamma) - kernel_size = max(t if t % 2 else t + 1, 3) - - # PyTorch circular padding mode is buggy as of pytorch 1.4 - # see https://github.com/pytorch/pytorch/pull/17240 - # implement manual circular padding - self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=0, bias=False) - self.padding = (kernel_size - 1) // 2 - - def forward(self, x): - y = x.mean((2, 3)).view(x.shape[0], 1, -1) - # Manually implement circular padding, F.pad does not seemed to be bugged - y = F.pad(y, (self.padding, self.padding), mode='circular') - y = self.conv(y) - y = y.view(x.shape[0], -1, 1, 1).sigmoid() - return x * y.expand_as(x) diff --git a/AVLFormer/src/timm/models/layers/evo_norm.py b/AVLFormer/src/timm/models/layers/evo_norm.py deleted file mode 100644 index 9023afd..0000000 --- a/AVLFormer/src/timm/models/layers/evo_norm.py +++ /dev/null @@ -1,83 +0,0 @@ -"""EvoNormB0 (Batched) and EvoNormS0 (Sample) in PyTorch - -An attempt at getting decent performing EvoNorms running in PyTorch. -While currently faster than other impl, still quite a ways off the built-in BN -in terms of memory usage and throughput (roughly 5x mem, 1/2 - 1/3x speed). - -Still very much a WIP, fiddling with buffer usage, in-place/jit optimizations, and layouts. - -Hacked together by / Copyright 2020 Ross Wightman -""" - -import torch -import torch.nn as nn - - -class EvoNormBatch2d(nn.Module): - def __init__(self, num_features, apply_act=True, momentum=0.1, eps=1e-5, drop_block=None): - super(EvoNormBatch2d, self).__init__() - self.apply_act = apply_act # apply activation (non-linearity) - self.momentum = momentum - self.eps = eps - param_shape = (1, num_features, 1, 1) - self.weight = nn.Parameter(torch.ones(param_shape), requires_grad=True) - self.bias = nn.Parameter(torch.zeros(param_shape), requires_grad=True) - if apply_act: - self.v = nn.Parameter(torch.ones(param_shape), requires_grad=True) - self.register_buffer('running_var', torch.ones(1, num_features, 1, 1)) - self.reset_parameters() - - def reset_parameters(self): - nn.init.ones_(self.weight) - nn.init.zeros_(self.bias) - if self.apply_act: - nn.init.ones_(self.v) - - def forward(self, x): - assert x.dim() == 4, 'expected 4D input' - x_type = x.dtype - if self.training: - var = x.var(dim=(0, 2, 3), unbiased=False, keepdim=True) - n = x.numel() / x.shape[1] - self.running_var.copy_( - var.detach() * self.momentum * (n / (n - 1)) + self.running_var * (1 - self.momentum)) - else: - var = self.running_var - - if self.apply_act: - v = self.v.to(dtype=x_type) - d = x * v + (x.var(dim=(2, 3), unbiased=False, keepdim=True) + self.eps).sqrt().to(dtype=x_type) - d = d.max((var + self.eps).sqrt().to(dtype=x_type)) - x = x / d - return x * self.weight + self.bias - - -class EvoNormSample2d(nn.Module): - def __init__(self, num_features, apply_act=True, groups=8, eps=1e-5, drop_block=None): - super(EvoNormSample2d, self).__init__() - self.apply_act = apply_act # apply activation (non-linearity) - self.groups = groups - self.eps = eps - param_shape = (1, num_features, 1, 1) - self.weight = nn.Parameter(torch.ones(param_shape), requires_grad=True) - self.bias = nn.Parameter(torch.zeros(param_shape), requires_grad=True) - if apply_act: - self.v = nn.Parameter(torch.ones(param_shape), requires_grad=True) - 
self.reset_parameters() - - def reset_parameters(self): - nn.init.ones_(self.weight) - nn.init.zeros_(self.bias) - if self.apply_act: - nn.init.ones_(self.v) - - def forward(self, x): - assert x.dim() == 4, 'expected 4D input' - B, C, H, W = x.shape - assert C % self.groups == 0 - if self.apply_act: - n = x * (x * self.v).sigmoid() - x = x.reshape(B, self.groups, -1) - x = n.reshape(B, self.groups, -1) / (x.var(dim=-1, unbiased=False, keepdim=True) + self.eps).sqrt() - x = x.reshape(B, C, H, W) - return x * self.weight + self.bias diff --git a/AVLFormer/src/timm/models/layers/halo_attn.py b/AVLFormer/src/timm/models/layers/halo_attn.py deleted file mode 100644 index bd5d1b4..0000000 --- a/AVLFormer/src/timm/models/layers/halo_attn.py +++ /dev/null @@ -1,157 +0,0 @@ -""" Halo Self Attention - -Paper: `Scaling Local Self-Attention for Parameter Efficient Visual Backbones` - - https://arxiv.org/abs/2103.12731 - -@misc{2103.12731, -Author = {Ashish Vaswani and Prajit Ramachandran and Aravind Srinivas and Niki Parmar and Blake Hechtman and - Jonathon Shlens}, -Title = {Scaling Local Self-Attention for Parameter Efficient Visual Backbones}, -Year = {2021}, -} - -Status: -This impl is a WIP, there is no official ref impl and some details in paper weren't clear to me. - -Trying to match the 'H1' variant in the paper, my parameter counts are 2M less and the model -is extremely slow. Something isn't right. However, the models do appear to train and experimental -variants with attn in C4 and/or C5 stages are tolerable speed. - -Hacked together by / Copyright 2021 Ross Wightman -""" -from typing import Tuple, List - -import torch -from torch import nn -import torch.nn.functional as F - - -def rel_logits_1d(q, rel_k, permute_mask: List[int]): - """ Compute relative logits along one dimension - - As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2 - Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925 - - Args: - q: (batch, height, width, dim) - rel_k: (2 * window - 1, dim) - permute_mask: permute output dim according to this - """ - B, H, W, dim = q.shape - rel_size = rel_k.shape[0] - win_size = (rel_size + 1) // 2 - - x = (q @ rel_k.transpose(-1, -2)) - x = x.reshape(-1, W, rel_size) - - # pad to shift from relative to absolute indexing - x_pad = F.pad(x, [0, 1]).flatten(1) - x_pad = F.pad(x_pad, [0, rel_size - W]) - - # reshape and slice out the padded elements - x_pad = x_pad.reshape(-1, W + 1, rel_size) - x = x_pad[:, :W, win_size - 1:] - - # reshape and tile - x = x.reshape(B, H, 1, W, win_size).expand(-1, -1, win_size, -1, -1) - return x.permute(permute_mask) - - -class PosEmbedRel(nn.Module): - """ Relative Position Embedding - As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2 - Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925 - - """ - def __init__(self, block_size, win_size, dim_head, scale): - """ - Args: - block_size (int): block size - win_size (int): neighbourhood window size - dim_head (int): attention head dim - scale (float): scale factor (for init) - """ - super().__init__() - self.block_size = block_size - self.dim_head = dim_head - self.scale = scale - self.height_rel = nn.Parameter(torch.randn(win_size * 2 - 1, dim_head) * self.scale) - self.width_rel = nn.Parameter(torch.randn(win_size * 2 - 1, dim_head) * self.scale) - - def forward(self, q): - B, BB, HW, _ = q.shape - - # relative logits in width dimension. 
- q = q.reshape(-1, self.block_size, self.block_size, self.dim_head) - rel_logits_w = rel_logits_1d(q, self.width_rel, permute_mask=(0, 1, 3, 2, 4)) - - # relative logits in height dimension. - q = q.transpose(1, 2) - rel_logits_h = rel_logits_1d(q, self.height_rel, permute_mask=(0, 3, 1, 4, 2)) - - rel_logits = rel_logits_h + rel_logits_w - rel_logits = rel_logits.reshape(B, BB, HW, -1) - return rel_logits - - -class HaloAttn(nn.Module): - """ Halo Attention - - Paper: `Scaling Local Self-Attention for Parameter Efficient Visual Backbones` - - https://arxiv.org/abs/2103.12731 - """ - def __init__( - self, dim, dim_out=None, stride=1, num_heads=8, dim_head=16, block_size=8, halo_size=3, qkv_bias=False): - super().__init__() - dim_out = dim_out or dim - assert dim_out % num_heads == 0 - self.stride = stride - self.num_heads = num_heads - self.dim_head = dim_head - self.dim_qk = num_heads * dim_head - self.dim_v = dim_out - self.block_size = block_size - self.halo_size = halo_size - self.win_size = block_size + halo_size * 2 # neighbourhood window size - self.scale = self.dim_head ** -0.5 - - # FIXME not clear if this stride behaviour is what the paper intended, not really clear - # Also, the paper mentions using a 3D conv for dealing with the blocking/gather, and leaving - # data in unfolded block form. I haven't wrapped my head around how that'd look. - self.q = nn.Conv2d(dim, self.dim_qk, 1, stride=self.stride, bias=qkv_bias) - self.kv = nn.Conv2d(dim, self.dim_qk + self.dim_v, 1, bias=qkv_bias) - - self.pos_embed = PosEmbedRel( - block_size=block_size // self.stride, win_size=self.win_size, dim_head=self.dim_head, scale=self.scale) - - def forward(self, x): - B, C, H, W = x.shape - assert H % self.block_size == 0 and W % self.block_size == 0 - num_h_blocks = H // self.block_size - num_w_blocks = W // self.block_size - num_blocks = num_h_blocks * num_w_blocks - - q = self.q(x) - q = F.unfold(q, kernel_size=self.block_size // self.stride, stride=self.block_size // self.stride) - # B, num_heads * dim_head * block_size ** 2, num_blocks - q = q.reshape(B * self.num_heads, self.dim_head, -1, num_blocks).transpose(1, 3) - # B * num_heads, num_blocks, block_size ** 2, dim_head - - kv = self.kv(x) - # FIXME I 'think' this unfold does what I want it to, but I should investigate - k = F.unfold(kv, kernel_size=self.win_size, stride=self.block_size, padding=self.halo_size) - k = k.reshape( - B * self.num_heads, self.dim_head + (self.dim_v // self.num_heads), -1, num_blocks).transpose(1, 3) - k, v = torch.split(k, [self.dim_head, self.dim_v // self.num_heads], dim=-1) - - attn_logits = (q @ k.transpose(-1, -2)) * self.scale # FIXME should usual attn scale be applied? 
- attn_logits = attn_logits + self.pos_embed(q) # B * num_heads, block_size ** 2, win_size ** 2 - - attn_out = attn_logits.softmax(dim=-1) - attn_out = (attn_out @ v).transpose(1, 3) # B * num_heads, dim_v // num_heads, block_size ** 2, num_blocks - attn_out = F.fold( - attn_out.reshape(B, -1, num_blocks), - (H // self.stride, W // self.stride), - kernel_size=self.block_size // self.stride, stride=self.block_size // self.stride) - # B, dim_out, H // stride, W // stride - return attn_out diff --git a/AVLFormer/src/timm/models/layers/helpers.py b/AVLFormer/src/timm/models/layers/helpers.py deleted file mode 100644 index 7a738d5..0000000 --- a/AVLFormer/src/timm/models/layers/helpers.py +++ /dev/null @@ -1,31 +0,0 @@ -""" Layer/Module Helpers - -Hacked together by / Copyright 2020 Ross Wightman -""" -from itertools import repeat -import collections.abc - - -# From PyTorch internals -def _ntuple(n): - def parse(x): - if isinstance(x, collections.abc.Iterable): - return x - return tuple(repeat(x, n)) - return parse - - -to_1tuple = _ntuple(1) -to_2tuple = _ntuple(2) -to_3tuple = _ntuple(3) -to_4tuple = _ntuple(4) -to_ntuple = _ntuple - - -def make_divisible(v, divisor=8, min_value=None): - min_value = min_value or divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. - if new_v < 0.9 * v: - new_v += divisor - return new_v diff --git a/AVLFormer/src/timm/models/layers/inplace_abn.py b/AVLFormer/src/timm/models/layers/inplace_abn.py deleted file mode 100644 index 3aae7cf..0000000 --- a/AVLFormer/src/timm/models/layers/inplace_abn.py +++ /dev/null @@ -1,87 +0,0 @@ -import torch -from torch import nn as nn - -try: - from inplace_abn.functions import inplace_abn, inplace_abn_sync - has_iabn = True -except ImportError: - has_iabn = False - - def inplace_abn(x, weight, bias, running_mean, running_var, - training=True, momentum=0.1, eps=1e-05, activation="leaky_relu", activation_param=0.01): - raise ImportError( - "Please install InplaceABN:'pip install git+https://github.com/mapillary/inplace_abn.git@v1.0.12'") - - def inplace_abn_sync(**kwargs): - inplace_abn(**kwargs) - - -class InplaceAbn(nn.Module): - """Activated Batch Normalization - - This gathers a BatchNorm and an activation function in a single module - - Parameters - ---------- - num_features : int - Number of feature channels in the input and output. - eps : float - Small constant to prevent numerical issues. - momentum : float - Momentum factor applied to compute running statistics. - affine : bool - If `True` apply learned scale and shift transformation after normalization. - act_layer : str or nn.Module type - Name or type of the activation functions, one of: `leaky_relu`, `elu` - act_param : float - Negative slope for the `leaky_relu` activation. 
- """ - - def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, apply_act=True, - act_layer="leaky_relu", act_param=0.01, drop_block=None): - super(InplaceAbn, self).__init__() - self.num_features = num_features - self.affine = affine - self.eps = eps - self.momentum = momentum - if apply_act: - if isinstance(act_layer, str): - assert act_layer in ('leaky_relu', 'elu', 'identity', '') - self.act_name = act_layer if act_layer else 'identity' - else: - # convert act layer passed as type to string - if act_layer == nn.ELU: - self.act_name = 'elu' - elif act_layer == nn.LeakyReLU: - self.act_name = 'leaky_relu' - elif act_layer == nn.Identity: - self.act_name = 'identity' - else: - assert False, f'Invalid act layer {act_layer.__name__} for IABN' - else: - self.act_name = 'identity' - self.act_param = act_param - if self.affine: - self.weight = nn.Parameter(torch.ones(num_features)) - self.bias = nn.Parameter(torch.zeros(num_features)) - else: - self.register_parameter('weight', None) - self.register_parameter('bias', None) - self.register_buffer('running_mean', torch.zeros(num_features)) - self.register_buffer('running_var', torch.ones(num_features)) - self.reset_parameters() - - def reset_parameters(self): - nn.init.constant_(self.running_mean, 0) - nn.init.constant_(self.running_var, 1) - if self.affine: - nn.init.constant_(self.weight, 1) - nn.init.constant_(self.bias, 0) - - def forward(self, x): - output = inplace_abn( - x, self.weight, self.bias, self.running_mean, self.running_var, - self.training, self.momentum, self.eps, self.act_name, self.act_param) - if isinstance(output, tuple): - output = output[0] - return output diff --git a/AVLFormer/src/timm/models/layers/lambda_layer.py b/AVLFormer/src/timm/models/layers/lambda_layer.py deleted file mode 100644 index bdaebb5..0000000 --- a/AVLFormer/src/timm/models/layers/lambda_layer.py +++ /dev/null @@ -1,78 +0,0 @@ -""" Lambda Layer - -Paper: `LambdaNetworks: Modeling Long-Range Interactions Without Attention` - - https://arxiv.org/abs/2102.08602 - -@misc{2102.08602, -Author = {Irwan Bello}, -Title = {LambdaNetworks: Modeling Long-Range Interactions Without Attention}, -Year = {2021}, -} - -Status: -This impl is a WIP. Code snippets in the paper were used as reference but -good chance some details are missing/wrong. - -I've only implemented local lambda conv based pos embeddings. 
- -For a PyTorch impl that includes other embedding options checkout -https://github.com/lucidrains/lambda-networks - -Hacked together by / Copyright 2021 Ross Wightman -""" -import torch -from torch import nn -import torch.nn.functional as F - - - -class LambdaLayer(nn.Module): - """Lambda Layer w/ lambda conv position embedding - - Paper: `LambdaNetworks: Modeling Long-Range Interactions Without Attention` - - https://arxiv.org/abs/2102.08602 - """ - def __init__( - self, - dim, dim_out=None, stride=1, num_heads=4, dim_head=16, r=5, qkv_bias=False): - super().__init__() - self.dim_out = dim_out or dim - self.dim_k = dim_head # query depth 'k' - self.num_heads = num_heads - assert self.dim_out % num_heads == 0, ' should be divided by num_heads' - self.dim_v = self.dim_out // num_heads # value depth 'v' - self.r = r # relative position neighbourhood (lambda conv kernel size) - - self.qkv = nn.Conv2d( - dim, - num_heads * dim_head + dim_head + self.dim_v, - kernel_size=1, bias=qkv_bias) - self.norm_q = nn.BatchNorm2d(num_heads * dim_head) - self.norm_v = nn.BatchNorm2d(self.dim_v) - - # NOTE currently only supporting the local lambda convolutions for positional - self.conv_lambda = nn.Conv3d(1, dim_head, (r, r, 1), padding=(r // 2, r // 2, 0)) - - self.pool = nn.AvgPool2d(2, 2) if stride == 2 else nn.Identity() - - def forward(self, x): - B, C, H, W = x.shape - M = H * W - - qkv = self.qkv(x) - q, k, v = torch.split(qkv, [ - self.num_heads * self.dim_k, self.dim_k, self.dim_v], dim=1) - q = self.norm_q(q).reshape(B, self.num_heads, self.dim_k, M).transpose(-1, -2) # B, num_heads, M, K - v = self.norm_v(v).reshape(B, self.dim_v, M).transpose(-1, -2) # B, M, V - k = F.softmax(k.reshape(B, self.dim_k, M), dim=-1) # B, K, M - - content_lam = k @ v # B, K, V - content_out = q @ content_lam.unsqueeze(1) # B, num_heads, M, V - - position_lam = self.conv_lambda(v.reshape(B, 1, H, W, self.dim_v)) # B, H, W, V, K - position_lam = position_lam.reshape(B, 1, self.dim_k, H * W, self.dim_v).transpose(2, 3) # B, 1, M, K, V - position_out = (q.unsqueeze(-2) @ position_lam).squeeze(-2) # B, num_heads, M, V - - out = (content_out + position_out).transpose(3, 1).reshape(B, C, H, W) # B, C (num_heads * V), H, W - out = self.pool(out) - return out diff --git a/AVLFormer/src/timm/models/layers/linear.py b/AVLFormer/src/timm/models/layers/linear.py deleted file mode 100644 index 38fe338..0000000 --- a/AVLFormer/src/timm/models/layers/linear.py +++ /dev/null @@ -1,19 +0,0 @@ -""" Linear layer (alternate definition) -""" -import torch -import torch.nn.functional as F -from torch import nn as nn - - -class Linear(nn.Linear): - r"""Applies a linear transformation to the incoming data: :math:`y = xA^T + b` - - Wraps torch.nn.Linear to support AMP + torchscript usage by manually casting - weight & bias to input.dtype to work around an issue w/ torch.addmm in this use case. 
- """ - def forward(self, input: torch.Tensor) -> torch.Tensor: - if torch.jit.is_scripting(): - bias = self.bias.to(dtype=input.dtype) if self.bias is not None else None - return F.linear(input, self.weight.to(dtype=input.dtype), bias=bias) - else: - return F.linear(input, self.weight, self.bias) diff --git a/AVLFormer/src/timm/models/layers/median_pool.py b/AVLFormer/src/timm/models/layers/median_pool.py deleted file mode 100644 index 40bd71a..0000000 --- a/AVLFormer/src/timm/models/layers/median_pool.py +++ /dev/null @@ -1,49 +0,0 @@ -""" Median Pool -Hacked together by / Copyright 2020 Ross Wightman -""" -import torch.nn as nn -import torch.nn.functional as F -from .helpers import to_2tuple, to_4tuple - - -class MedianPool2d(nn.Module): - """ Median pool (usable as median filter when stride=1) module. - - Args: - kernel_size: size of pooling kernel, int or 2-tuple - stride: pool stride, int or 2-tuple - padding: pool padding, int or 4-tuple (l, r, t, b) as in pytorch F.pad - same: override padding and enforce same padding, boolean - """ - def __init__(self, kernel_size=3, stride=1, padding=0, same=False): - super(MedianPool2d, self).__init__() - self.k = to_2tuple(kernel_size) - self.stride = to_2tuple(stride) - self.padding = to_4tuple(padding) # convert to l, r, t, b - self.same = same - - def _padding(self, x): - if self.same: - ih, iw = x.size()[2:] - if ih % self.stride[0] == 0: - ph = max(self.k[0] - self.stride[0], 0) - else: - ph = max(self.k[0] - (ih % self.stride[0]), 0) - if iw % self.stride[1] == 0: - pw = max(self.k[1] - self.stride[1], 0) - else: - pw = max(self.k[1] - (iw % self.stride[1]), 0) - pl = pw // 2 - pr = pw - pl - pt = ph // 2 - pb = ph - pt - padding = (pl, pr, pt, pb) - else: - padding = self.padding - return padding - - def forward(self, x): - x = F.pad(x, self._padding(x), mode='reflect') - x = x.unfold(2, self.k[0], self.stride[0]).unfold(3, self.k[1], self.stride[1]) - x = x.contiguous().view(x.size()[:4] + (-1,)).median(dim=-1)[0] - return x diff --git a/AVLFormer/src/timm/models/layers/mixed_conv2d.py b/AVLFormer/src/timm/models/layers/mixed_conv2d.py deleted file mode 100644 index fa0ce56..0000000 --- a/AVLFormer/src/timm/models/layers/mixed_conv2d.py +++ /dev/null @@ -1,51 +0,0 @@ -""" PyTorch Mixed Convolution - -Paper: MixConv: Mixed Depthwise Convolutional Kernels (https://arxiv.org/abs/1907.09595) - -Hacked together by / Copyright 2020 Ross Wightman -""" - -import torch -from torch import nn as nn - -from .conv2d_same import create_conv2d_pad - - -def _split_channels(num_chan, num_groups): - split = [num_chan // num_groups for _ in range(num_groups)] - split[0] += num_chan - sum(split) - return split - - -class MixedConv2d(nn.ModuleDict): - """ Mixed Grouped Convolution - - Based on MDConv and GroupedConv in MixNet impl: - https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mixnet/custom_layers.py - """ - def __init__(self, in_channels, out_channels, kernel_size=3, - stride=1, padding='', dilation=1, depthwise=False, **kwargs): - super(MixedConv2d, self).__init__() - - kernel_size = kernel_size if isinstance(kernel_size, list) else [kernel_size] - num_groups = len(kernel_size) - in_splits = _split_channels(in_channels, num_groups) - out_splits = _split_channels(out_channels, num_groups) - self.in_channels = sum(in_splits) - self.out_channels = sum(out_splits) - for idx, (k, in_ch, out_ch) in enumerate(zip(kernel_size, in_splits, out_splits)): - conv_groups = in_ch if depthwise else 1 - # use add_module to keep key space clean - 
self.add_module( - str(idx), - create_conv2d_pad( - in_ch, out_ch, k, stride=stride, - padding=padding, dilation=dilation, groups=conv_groups, **kwargs) - ) - self.splits = in_splits - - def forward(self, x): - x_split = torch.split(x, self.splits, 1) - x_out = [c(x_split[i]) for i, c in enumerate(self.values())] - x = torch.cat(x_out, 1) - return x diff --git a/AVLFormer/src/timm/models/layers/norm.py b/AVLFormer/src/timm/models/layers/norm.py deleted file mode 100644 index 2925e5c..0000000 --- a/AVLFormer/src/timm/models/layers/norm.py +++ /dev/null @@ -1,14 +0,0 @@ -""" Normalization layers and wrappers -""" -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class GroupNorm(nn.GroupNorm): - def __init__(self, num_channels, num_groups, eps=1e-5, affine=True): - # NOTE num_channels is swapped to first arg for consistency in swapping norm layers with BN - super().__init__(num_groups, num_channels, eps=eps, affine=affine) - - def forward(self, x): - return F.group_norm(x, self.num_groups, self.weight, self.bias, self.eps) diff --git a/AVLFormer/src/timm/models/layers/norm_act.py b/AVLFormer/src/timm/models/layers/norm_act.py deleted file mode 100644 index 02cabe8..0000000 --- a/AVLFormer/src/timm/models/layers/norm_act.py +++ /dev/null @@ -1,85 +0,0 @@ -""" Normalization + Activation Layers -""" -import torch -from torch import nn as nn -from torch.nn import functional as F - -from .create_act import get_act_layer - - -class BatchNormAct2d(nn.BatchNorm2d): - """BatchNorm + Activation - - This module performs BatchNorm + Activation in a manner that will remain backwards - compatible with weights trained with separate bn, act. This is why we inherit from BN - instead of composing it as a .bn member. - """ - def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True, - apply_act=True, act_layer=nn.ReLU, inplace=True, drop_block=None): - super(BatchNormAct2d, self).__init__( - num_features, eps=eps, momentum=momentum, affine=affine, track_running_stats=track_running_stats) - if isinstance(act_layer, str): - act_layer = get_act_layer(act_layer) - if act_layer is not None and apply_act: - act_args = dict(inplace=True) if inplace else {} - self.act = act_layer(**act_args) - else: - self.act = nn.Identity() - - def _forward_jit(self, x): - """ A cut & paste of the contents of the PyTorch BatchNorm2d forward function - """ - # exponential_average_factor is self.momentum set to - # (when it is available) only so that if gets updated - # in ONNX graph when this node is exported to ONNX. - if self.momentum is None: - exponential_average_factor = 0.0 - else: - exponential_average_factor = self.momentum - - if self.training and self.track_running_stats: - # TODO: if statement only here to tell the jit to skip emitting this when it is None - if self.num_batches_tracked is not None: - self.num_batches_tracked += 1 - if self.momentum is None: # use cumulative moving average - exponential_average_factor = 1.0 / float(self.num_batches_tracked) - else: # use exponential moving average - exponential_average_factor = self.momentum - - x = F.batch_norm( - x, self.running_mean, self.running_var, self.weight, self.bias, - self.training or not self.track_running_stats, - exponential_average_factor, self.eps) - return x - - @torch.jit.ignore - def _forward_python(self, x): - return super(BatchNormAct2d, self).forward(x) - - def forward(self, x): - # FIXME cannot call parent forward() and maintain jit.script compatibility? 
- if torch.jit.is_scripting(): - x = self._forward_jit(x) - else: - x = self._forward_python(x) - x = self.act(x) - return x - - -class GroupNormAct(nn.GroupNorm): - # NOTE num_channel and num_groups order flipped for easier layer swaps / binding of fixed args - def __init__(self, num_channels, num_groups, eps=1e-5, affine=True, - apply_act=True, act_layer=nn.ReLU, inplace=True, drop_block=None): - super(GroupNormAct, self).__init__(num_groups, num_channels, eps=eps, affine=affine) - if isinstance(act_layer, str): - act_layer = get_act_layer(act_layer) - if act_layer is not None and apply_act: - act_args = dict(inplace=True) if inplace else {} - self.act = act_layer(**act_args) - else: - self.act = nn.Identity() - - def forward(self, x): - x = F.group_norm(x, self.num_groups, self.weight, self.bias, self.eps) - x = self.act(x) - return x diff --git a/AVLFormer/src/timm/models/layers/padding.py b/AVLFormer/src/timm/models/layers/padding.py deleted file mode 100644 index 34afc37..0000000 --- a/AVLFormer/src/timm/models/layers/padding.py +++ /dev/null @@ -1,56 +0,0 @@ -""" Padding Helpers - -Hacked together by / Copyright 2020 Ross Wightman -""" -import math -from typing import List, Tuple - -import torch.nn.functional as F - - -# Calculate symmetric padding for a convolution -def get_padding(kernel_size: int, stride: int = 1, dilation: int = 1, **_) -> int: - padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 - return padding - - -# Calculate asymmetric TensorFlow-like 'SAME' padding for a convolution -def get_same_padding(x: int, k: int, s: int, d: int): - return max((math.ceil(x / s) - 1) * s + (k - 1) * d + 1 - x, 0) - - -# Can SAME padding for given args be done statically? -def is_static_pad(kernel_size: int, stride: int = 1, dilation: int = 1, **_): - return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0 - - -# Dynamically pad input x with 'SAME' padding for conv with specified args -def pad_same(x, k: List[int], s: List[int], d: List[int] = (1, 1), value: float = 0): - ih, iw = x.size()[-2:] - pad_h, pad_w = get_same_padding(ih, k[0], s[0], d[0]), get_same_padding(iw, k[1], s[1], d[1]) - if pad_h > 0 or pad_w > 0: - x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2], value=value) - return x - - -def get_padding_value(padding, kernel_size, **kwargs) -> Tuple[Tuple, bool]: - dynamic = False - if isinstance(padding, str): - # for any string padding, the padding will be calculated for you, one of three ways - padding = padding.lower() - if padding == 'same': - # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact - if is_static_pad(kernel_size, **kwargs): - # static case, no extra overhead - padding = get_padding(kernel_size, **kwargs) - else: - # dynamic 'SAME' padding, has runtime/GPU memory overhead - padding = 0 - dynamic = True - elif padding == 'valid': - # 'VALID' padding, same as padding=0 - padding = 0 - else: - # Default to PyTorch style 'same'-ish symmetric padding - padding = get_padding(kernel_size, **kwargs) - return padding, dynamic diff --git a/AVLFormer/src/timm/models/layers/pool2d_same.py b/AVLFormer/src/timm/models/layers/pool2d_same.py deleted file mode 100644 index 5fcd0f1..0000000 --- a/AVLFormer/src/timm/models/layers/pool2d_same.py +++ /dev/null @@ -1,71 +0,0 @@ -""" AvgPool2d w/ Same Padding - -Hacked together by / Copyright 2020 Ross Wightman -""" -import torch -import torch.nn as nn -import torch.nn.functional as F -from typing import List, Tuple, Optional - -from .helpers import 
to_2tuple -from .padding import pad_same, get_padding_value - - -def avg_pool2d_same(x, kernel_size: List[int], stride: List[int], padding: List[int] = (0, 0), - ceil_mode: bool = False, count_include_pad: bool = True): - # FIXME how to deal with count_include_pad vs not for external padding? - x = pad_same(x, kernel_size, stride) - return F.avg_pool2d(x, kernel_size, stride, (0, 0), ceil_mode, count_include_pad) - - -class AvgPool2dSame(nn.AvgPool2d): - """ Tensorflow like 'SAME' wrapper for 2D average pooling - """ - def __init__(self, kernel_size: int, stride=None, padding=0, ceil_mode=False, count_include_pad=True): - kernel_size = to_2tuple(kernel_size) - stride = to_2tuple(stride) - super(AvgPool2dSame, self).__init__(kernel_size, stride, (0, 0), ceil_mode, count_include_pad) - - def forward(self, x): - return avg_pool2d_same( - x, self.kernel_size, self.stride, self.padding, self.ceil_mode, self.count_include_pad) - - -def max_pool2d_same( - x, kernel_size: List[int], stride: List[int], padding: List[int] = (0, 0), - dilation: List[int] = (1, 1), ceil_mode: bool = False): - x = pad_same(x, kernel_size, stride, value=-float('inf')) - return F.max_pool2d(x, kernel_size, stride, (0, 0), dilation, ceil_mode) - - -class MaxPool2dSame(nn.MaxPool2d): - """ Tensorflow like 'SAME' wrapper for 2D max pooling - """ - def __init__(self, kernel_size: int, stride=None, padding=0, dilation=1, ceil_mode=False, count_include_pad=True): - kernel_size = to_2tuple(kernel_size) - stride = to_2tuple(stride) - dilation = to_2tuple(dilation) - super(MaxPool2dSame, self).__init__(kernel_size, stride, (0, 0), dilation, ceil_mode, count_include_pad) - - def forward(self, x): - return max_pool2d_same(x, self.kernel_size, self.stride, self.padding, self.dilation, self.ceil_mode) - - -def create_pool2d(pool_type, kernel_size, stride=None, **kwargs): - stride = stride or kernel_size - padding = kwargs.pop('padding', '') - padding, is_dynamic = get_padding_value(padding, kernel_size, stride=stride, **kwargs) - if is_dynamic: - if pool_type == 'avg': - return AvgPool2dSame(kernel_size, stride=stride, **kwargs) - elif pool_type == 'max': - return MaxPool2dSame(kernel_size, stride=stride, **kwargs) - else: - assert False, f'Unsupported pool type {pool_type}' - else: - if pool_type == 'avg': - return nn.AvgPool2d(kernel_size, stride=stride, padding=padding, **kwargs) - elif pool_type == 'max': - return nn.MaxPool2d(kernel_size, stride=stride, padding=padding, **kwargs) - else: - assert False, f'Unsupported pool type {pool_type}' diff --git a/AVLFormer/src/timm/models/layers/se.py b/AVLFormer/src/timm/models/layers/se.py deleted file mode 100644 index 54c0ef3..0000000 --- a/AVLFormer/src/timm/models/layers/se.py +++ /dev/null @@ -1,50 +0,0 @@ -from torch import nn as nn -import torch.nn.functional as F - -from .create_act import create_act_layer -from .helpers import make_divisible - - -class SEModule(nn.Module): - """ SE Module as defined in original SE-Nets with a few additions - Additions include: - * min_channels can be specified to keep reduced channel count at a minimum (default: 8) - * divisor can be specified to keep channels rounded to specified values (default: 1) - * reduction channels can be specified directly by arg (if reduction_channels is set) - * reduction channels can be specified by float ratio (if reduction_ratio is set) - """ - def __init__(self, channels, reduction=16, act_layer=nn.ReLU, gate_layer='sigmoid', - reduction_ratio=None, reduction_channels=None, min_channels=8, divisor=1): - 
super(SEModule, self).__init__() - if reduction_channels is not None: - reduction_channels = reduction_channels # direct specification highest priority, no rounding/min done - elif reduction_ratio is not None: - reduction_channels = make_divisible(channels * reduction_ratio, divisor, min_channels) - else: - reduction_channels = make_divisible(channels // reduction, divisor, min_channels) - self.fc1 = nn.Conv2d(channels, reduction_channels, kernel_size=1, bias=True) - self.act = act_layer(inplace=True) - self.fc2 = nn.Conv2d(reduction_channels, channels, kernel_size=1, bias=True) - self.gate = create_act_layer(gate_layer) - - def forward(self, x): - x_se = x.mean((2, 3), keepdim=True) - x_se = self.fc1(x_se) - x_se = self.act(x_se) - x_se = self.fc2(x_se) - return x * self.gate(x_se) - - -class EffectiveSEModule(nn.Module): - """ 'Effective Squeeze-Excitation - From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667 - """ - def __init__(self, channels, gate_layer='hard_sigmoid'): - super(EffectiveSEModule, self).__init__() - self.fc = nn.Conv2d(channels, channels, kernel_size=1, padding=0) - self.gate = create_act_layer(gate_layer, inplace=True) - - def forward(self, x): - x_se = x.mean((2, 3), keepdim=True) - x_se = self.fc(x_se) - return x * self.gate(x_se) diff --git a/AVLFormer/src/timm/models/layers/selective_kernel.py b/AVLFormer/src/timm/models/layers/selective_kernel.py deleted file mode 100644 index 10bfd0e..0000000 --- a/AVLFormer/src/timm/models/layers/selective_kernel.py +++ /dev/null @@ -1,118 +0,0 @@ -""" Selective Kernel Convolution/Attention - -Paper: Selective Kernel Networks (https://arxiv.org/abs/1903.06586) - -Hacked together by / Copyright 2020 Ross Wightman -""" -import torch -from torch import nn as nn - -from .conv_bn_act import ConvBnAct - - -def _kernel_valid(k): - if isinstance(k, (list, tuple)): - for ki in k: - return _kernel_valid(ki) - assert k >= 3 and k % 2 - - -class SelectiveKernelAttn(nn.Module): - def __init__(self, channels, num_paths=2, attn_channels=32, - act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d): - """ Selective Kernel Attention Module - - Selective Kernel attention mechanism factored out into its own module. - - """ - super(SelectiveKernelAttn, self).__init__() - self.num_paths = num_paths - self.fc_reduce = nn.Conv2d(channels, attn_channels, kernel_size=1, bias=False) - self.bn = norm_layer(attn_channels) - self.act = act_layer(inplace=True) - self.fc_select = nn.Conv2d(attn_channels, channels * num_paths, kernel_size=1, bias=False) - - def forward(self, x): - assert x.shape[1] == self.num_paths - x = x.sum(1).mean((2, 3), keepdim=True) - x = self.fc_reduce(x) - x = self.bn(x) - x = self.act(x) - x = self.fc_select(x) - B, C, H, W = x.shape - x = x.view(B, self.num_paths, C // self.num_paths, H, W) - x = torch.softmax(x, dim=1) - return x - - -class SelectiveKernelConv(nn.Module): - - def __init__(self, in_channels, out_channels, kernel_size=None, stride=1, dilation=1, groups=1, - attn_reduction=16, min_attn_channels=32, keep_3x3=True, split_input=False, - drop_block=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None): - """ Selective Kernel Convolution Module - - As described in Selective Kernel Networks (https://arxiv.org/abs/1903.06586) with some modifications. - - Largest change is the input split, which divides the input channels across each convolution path, this can - be viewed as a grouping of sorts, but the output channel counts expand to the module level value. 
This keeps - the parameter count from ballooning when the convolutions themselves don't have groups, but still provides - a noteworthy increase in performance over similar param count models without this attention layer. -Ross W - - Args: - in_channels (int): module input (feature) channel count - out_channels (int): module output (feature) channel count - kernel_size (int, list): kernel size for each convolution branch - stride (int): stride for convolutions - dilation (int): dilation for module as a whole, impacts dilation of each branch - groups (int): number of groups for each branch - attn_reduction (int, float): reduction factor for attention features - min_attn_channels (int): minimum attention feature channels - keep_3x3 (bool): keep all branch convolution kernels as 3x3, changing larger kernels for dilations - split_input (bool): split input channels evenly across each convolution branch, keeps param count lower, - can be viewed as grouping by path, output expands to module out_channels count - drop_block (nn.Module): drop block module - act_layer (nn.Module): activation layer to use - norm_layer (nn.Module): batchnorm/norm layer to use - """ - super(SelectiveKernelConv, self).__init__() - kernel_size = kernel_size or [3, 5] # default to one 3x3 and one 5x5 branch. 5x5 -> 3x3 + dilation - _kernel_valid(kernel_size) - if not isinstance(kernel_size, list): - kernel_size = [kernel_size] * 2 - if keep_3x3: - dilation = [dilation * (k - 1) // 2 for k in kernel_size] - kernel_size = [3] * len(kernel_size) - else: - dilation = [dilation] * len(kernel_size) - self.num_paths = len(kernel_size) - self.in_channels = in_channels - self.out_channels = out_channels - self.split_input = split_input - if self.split_input: - assert in_channels % self.num_paths == 0 - in_channels = in_channels // self.num_paths - groups = min(out_channels, groups) - - conv_kwargs = dict( - stride=stride, groups=groups, drop_block=drop_block, act_layer=act_layer, norm_layer=norm_layer, - aa_layer=aa_layer) - self.paths = nn.ModuleList([ - ConvBnAct(in_channels, out_channels, kernel_size=k, dilation=d, **conv_kwargs) - for k, d in zip(kernel_size, dilation)]) - - attn_channels = max(int(out_channels / attn_reduction), min_attn_channels) - self.attn = SelectiveKernelAttn(out_channels, self.num_paths, attn_channels) - self.drop_block = drop_block - - def forward(self, x): - if self.split_input: - x_split = torch.split(x, self.in_channels // self.num_paths, 1) - x_paths = [op(x_split[i]) for i, op in enumerate(self.paths)] - else: - x_paths = [op(x) for op in self.paths] - x = torch.stack(x_paths, dim=1) - x_attn = self.attn(x) - x = x * x_attn - x = torch.sum(x, dim=1) - return x diff --git a/AVLFormer/src/timm/models/layers/separable_conv.py b/AVLFormer/src/timm/models/layers/separable_conv.py deleted file mode 100644 index 1ddcb4e..0000000 --- a/AVLFormer/src/timm/models/layers/separable_conv.py +++ /dev/null @@ -1,73 +0,0 @@ -""" Depthwise Separable Conv Modules - -Basic DWS convs. Other variations of DWS exist with batch norm or activations between the -DW and PW convs such as the Depthwise modules in MobileNetV2 / EfficientNet and Xception. 
- -Hacked together by / Copyright 2020 Ross Wightman -""" -from torch import nn as nn - -from .create_conv2d import create_conv2d -from .create_norm_act import convert_norm_act - - -class SeparableConvBnAct(nn.Module): - """ Separable Conv w/ trailing Norm and Activation - """ - def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dilation=1, padding='', bias=False, - channel_multiplier=1.0, pw_kernel_size=1, norm_layer=nn.BatchNorm2d, act_layer=nn.ReLU, - apply_act=True, drop_block=None): - super(SeparableConvBnAct, self).__init__() - - self.conv_dw = create_conv2d( - in_channels, int(in_channels * channel_multiplier), kernel_size, - stride=stride, dilation=dilation, padding=padding, depthwise=True) - - self.conv_pw = create_conv2d( - int(in_channels * channel_multiplier), out_channels, pw_kernel_size, padding=padding, bias=bias) - - norm_act_layer = convert_norm_act(norm_layer, act_layer) - self.bn = norm_act_layer(out_channels, apply_act=apply_act, drop_block=drop_block) - - @property - def in_channels(self): - return self.conv_dw.in_channels - - @property - def out_channels(self): - return self.conv_pw.out_channels - - def forward(self, x): - x = self.conv_dw(x) - x = self.conv_pw(x) - if self.bn is not None: - x = self.bn(x) - return x - - -class SeparableConv2d(nn.Module): - """ Separable Conv - """ - def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dilation=1, padding='', bias=False, - channel_multiplier=1.0, pw_kernel_size=1): - super(SeparableConv2d, self).__init__() - - self.conv_dw = create_conv2d( - in_channels, int(in_channels * channel_multiplier), kernel_size, - stride=stride, dilation=dilation, padding=padding, depthwise=True) - - self.conv_pw = create_conv2d( - int(in_channels * channel_multiplier), out_channels, pw_kernel_size, padding=padding, bias=bias) - - @property - def in_channels(self): - return self.conv_dw.in_channels - - @property - def out_channels(self): - return self.conv_pw.out_channels - - def forward(self, x): - x = self.conv_dw(x) - x = self.conv_pw(x) - return x diff --git a/AVLFormer/src/timm/models/layers/space_to_depth.py b/AVLFormer/src/timm/models/layers/space_to_depth.py deleted file mode 100644 index a7e8e0b..0000000 --- a/AVLFormer/src/timm/models/layers/space_to_depth.py +++ /dev/null @@ -1,53 +0,0 @@ -import torch -import torch.nn as nn - - -class SpaceToDepth(nn.Module): - def __init__(self, block_size=4): - super().__init__() - assert block_size == 4 - self.bs = block_size - - def forward(self, x): - N, C, H, W = x.size() - x = x.view(N, C, H // self.bs, self.bs, W // self.bs, self.bs) # (N, C, H//bs, bs, W//bs, bs) - x = x.permute(0, 3, 5, 1, 2, 4).contiguous() # (N, bs, bs, C, H//bs, W//bs) - x = x.view(N, C * (self.bs ** 2), H // self.bs, W // self.bs) # (N, C*bs^2, H//bs, W//bs) - return x - - -@torch.jit.script -class SpaceToDepthJit(object): - def __call__(self, x: torch.Tensor): - # assuming hard-coded that block_size==4 for acceleration - N, C, H, W = x.size() - x = x.view(N, C, H // 4, 4, W // 4, 4) # (N, C, H//bs, bs, W//bs, bs) - x = x.permute(0, 3, 5, 1, 2, 4).contiguous() # (N, bs, bs, C, H//bs, W//bs) - x = x.view(N, C * 16, H // 4, W // 4) # (N, C*bs^2, H//bs, W//bs) - return x - - -class SpaceToDepthModule(nn.Module): - def __init__(self, no_jit=False): - super().__init__() - if not no_jit: - self.op = SpaceToDepthJit() - else: - self.op = SpaceToDepth() - - def forward(self, x): - return self.op(x) - - -class DepthToSpace(nn.Module): - - def __init__(self, block_size): - 
super().__init__() - self.bs = block_size - - def forward(self, x): - N, C, H, W = x.size() - x = x.view(N, self.bs, self.bs, C // (self.bs ** 2), H, W) # (N, bs, bs, C//bs^2, H, W) - x = x.permute(0, 3, 4, 1, 5, 2).contiguous() # (N, C//bs^2, H, bs, W, bs) - x = x.view(N, C // (self.bs ** 2), H * self.bs, W * self.bs) # (N, C//bs^2, H * bs, W * bs) - return x diff --git a/AVLFormer/src/timm/models/layers/split_attn.py b/AVLFormer/src/timm/models/layers/split_attn.py deleted file mode 100644 index 5615aa0..0000000 --- a/AVLFormer/src/timm/models/layers/split_attn.py +++ /dev/null @@ -1,88 +0,0 @@ -""" Split Attention Conv2d (for ResNeSt Models) - -Paper: `ResNeSt: Split-Attention Networks` - /https://arxiv.org/abs/2004.08955 - -Adapted from original PyTorch impl at https://github.com/zhanghang1989/ResNeSt - -Modified for torchscript compat, performance, and consistency with timm by Ross Wightman -""" -import torch -import torch.nn.functional as F -from torch import nn - - -class RadixSoftmax(nn.Module): - def __init__(self, radix, cardinality): - super(RadixSoftmax, self).__init__() - self.radix = radix - self.cardinality = cardinality - - def forward(self, x): - batch = x.size(0) - if self.radix > 1: - x = x.view(batch, self.cardinality, self.radix, -1).transpose(1, 2) - x = F.softmax(x, dim=1) - x = x.reshape(batch, -1) - else: - x = torch.sigmoid(x) - return x - - -class SplitAttnConv2d(nn.Module): - """Split-Attention Conv2d - """ - def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, - dilation=1, groups=1, bias=False, radix=2, reduction_factor=4, - act_layer=nn.ReLU, norm_layer=None, drop_block=None, **kwargs): - super(SplitAttnConv2d, self).__init__() - self.radix = radix - self.drop_block = drop_block - mid_chs = out_channels * radix - attn_chs = max(in_channels * radix // reduction_factor, 32) - - self.conv = nn.Conv2d( - in_channels, mid_chs, kernel_size, stride, padding, dilation, - groups=groups * radix, bias=bias, **kwargs) - self.bn0 = norm_layer(mid_chs) if norm_layer is not None else None - self.act0 = act_layer(inplace=True) - self.fc1 = nn.Conv2d(out_channels, attn_chs, 1, groups=groups) - self.bn1 = norm_layer(attn_chs) if norm_layer is not None else None - self.act1 = act_layer(inplace=True) - self.fc2 = nn.Conv2d(attn_chs, mid_chs, 1, groups=groups) - self.rsoftmax = RadixSoftmax(radix, groups) - - @property - def in_channels(self): - return self.conv.in_channels - - @property - def out_channels(self): - return self.fc1.out_channels - - def forward(self, x): - x = self.conv(x) - if self.bn0 is not None: - x = self.bn0(x) - if self.drop_block is not None: - x = self.drop_block(x) - x = self.act0(x) - - B, RC, H, W = x.shape - if self.radix > 1: - x = x.reshape((B, self.radix, RC // self.radix, H, W)) - x_gap = x.sum(dim=1) - else: - x_gap = x - x_gap = F.adaptive_avg_pool2d(x_gap, 1) - x_gap = self.fc1(x_gap) - if self.bn1 is not None: - x_gap = self.bn1(x_gap) - x_gap = self.act1(x_gap) - x_attn = self.fc2(x_gap) - - x_attn = self.rsoftmax(x_attn).view(B, -1, 1, 1) - if self.radix > 1: - out = (x * x_attn.reshape((B, self.radix, RC // self.radix, 1, 1))).sum(dim=1) - else: - out = x * x_attn - return out.contiguous() diff --git a/AVLFormer/src/timm/models/layers/split_batchnorm.py b/AVLFormer/src/timm/models/layers/split_batchnorm.py deleted file mode 100644 index 830781b..0000000 --- a/AVLFormer/src/timm/models/layers/split_batchnorm.py +++ /dev/null @@ -1,75 +0,0 @@ -""" Split BatchNorm - -A PyTorch BatchNorm layer that splits input batch 
into N equal parts and passes each through -a separate BN layer. The first split is passed through the parent BN layers with weight/bias -keys the same as the original BN. All other splits pass through BN sub-layers under the '.aux_bn' -namespace. - -This allows easily removing the auxiliary BN layers after training to efficiently -achieve the 'Auxiliary BatchNorm' as described in the AdvProp Paper, section 4.2, -'Disentangled Learning via An Auxiliary BN' - -Hacked together by / Copyright 2020 Ross Wightman -""" -import torch -import torch.nn as nn - - -class SplitBatchNorm2d(torch.nn.BatchNorm2d): - - def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, - track_running_stats=True, num_splits=2): - super().__init__(num_features, eps, momentum, affine, track_running_stats) - assert num_splits > 1, 'Should have at least one aux BN layer (num_splits at least 2)' - self.num_splits = num_splits - self.aux_bn = nn.ModuleList([ - nn.BatchNorm2d(num_features, eps, momentum, affine, track_running_stats) for _ in range(num_splits - 1)]) - - def forward(self, input: torch.Tensor): - if self.training: # aux BN only relevant while training - split_size = input.shape[0] // self.num_splits - assert input.shape[0] == split_size * self.num_splits, "batch size must be evenly divisible by num_splits" - split_input = input.split(split_size) - x = [super().forward(split_input[0])] - for i, a in enumerate(self.aux_bn): - x.append(a(split_input[i + 1])) - return torch.cat(x, dim=0) - else: - return super().forward(input) - - -def convert_splitbn_model(module, num_splits=2): - """ - Recursively traverse module and its children to replace all instances of - ``torch.nn.modules.batchnorm._BatchNorm`` with `SplitBatchnorm2d`. - Args: - module (torch.nn.Module): input module - num_splits: number of separate batchnorm layers to split input across - Example:: - >>> # model is an instance of torch.nn.Module - >>> model = timm.models.convert_splitbn_model(model, num_splits=2) - """ - mod = module - if isinstance(module, torch.nn.modules.instancenorm._InstanceNorm): - return module - if isinstance(module, torch.nn.modules.batchnorm._BatchNorm): - mod = SplitBatchNorm2d( - module.num_features, module.eps, module.momentum, module.affine, - module.track_running_stats, num_splits=num_splits) - mod.running_mean = module.running_mean - mod.running_var = module.running_var - mod.num_batches_tracked = module.num_batches_tracked - if module.affine: - mod.weight.data = module.weight.data.clone().detach() - mod.bias.data = module.bias.data.clone().detach() - for aux in mod.aux_bn: - aux.running_mean = module.running_mean.clone() - aux.running_var = module.running_var.clone() - aux.num_batches_tracked = module.num_batches_tracked.clone() - if module.affine: - aux.weight.data = module.weight.data.clone().detach() - aux.bias.data = module.bias.data.clone().detach() - for name, child in module.named_children(): - mod.add_module(name, convert_splitbn_model(child, num_splits=num_splits)) - del module - return mod diff --git a/AVLFormer/src/timm/models/layers/std_conv.py b/AVLFormer/src/timm/models/layers/std_conv.py deleted file mode 100644 index b0cb1ee..0000000 --- a/AVLFormer/src/timm/models/layers/std_conv.py +++ /dev/null @@ -1,143 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - -from .padding import get_padding, get_padding_value, pad_same - - -def get_weight(module): - std, mean = torch.std_mean(module.weight, dim=[1, 2, 3], keepdim=True, unbiased=False) - weight = (module.weight - 
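# ---------------------------------------------------------------------------
# Hedged usage sketch for convert_splitbn_model / SplitBatchNorm2d from
# split_batchnorm.py above (assumes those definitions are in scope). With
# num_splits=2 and a batch of 8, samples 0..3 use the parent BN and samples
# 4..7 the auxiliary BN, keeping separate statistics for e.g. the clean and
# adversarial halves of an AdvProp-style batch.
import torch
import torch.nn as nn

model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.BatchNorm2d(8), nn.ReLU())
model = convert_splitbn_model(model, num_splits=2)   # BatchNorm2d -> SplitBatchNorm2d
model.train()
_ = model(torch.randn(8, 3, 16, 16))   # training batch must divide evenly
model.eval()
_ = model(torch.randn(5, 3, 16, 16))   # eval path uses only the parent BN stats
# ---------------------------------------------------------------------------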
mean) / (std + module.eps) - return weight - - -class StdConv2d(nn.Conv2d): - """Conv2d with Weight Standardization. Used for BiT ResNet-V2 models. - - Paper: `Micro-Batch Training with Batch-Channel Normalization and Weight Standardization` - - https://arxiv.org/abs/1903.10520v2 - """ - def __init__( - self, in_channel, out_channels, kernel_size, stride=1, padding=None, dilation=1, - groups=1, bias=False, eps=1e-5): - if padding is None: - padding = get_padding(kernel_size, stride, dilation) - super().__init__( - in_channel, out_channels, kernel_size, stride=stride, - padding=padding, dilation=dilation, groups=groups, bias=bias) - self.eps = eps - - def get_weight(self): - std, mean = torch.std_mean(self.weight, dim=[1, 2, 3], keepdim=True, unbiased=False) - weight = (self.weight - mean) / (std + self.eps) - return weight - - def forward(self, x): - x = F.conv2d(x, self.get_weight(), self.bias, self.stride, self.padding, self.dilation, self.groups) - return x - - -class StdConv2dSame(nn.Conv2d): - """Conv2d with Weight Standardization. TF compatible SAME padding. Used for ViT Hybrid model. - - Paper: `Micro-Batch Training with Batch-Channel Normalization and Weight Standardization` - - https://arxiv.org/abs/1903.10520v2 - """ - def __init__( - self, in_channel, out_channels, kernel_size, stride=1, padding='SAME', dilation=1, - groups=1, bias=False, eps=1e-5): - padding, is_dynamic = get_padding_value(padding, kernel_size, stride=stride, dilation=dilation) - super().__init__( - in_channel, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, - groups=groups, bias=bias) - self.same_pad = is_dynamic - self.eps = eps - - def get_weight(self): - std, mean = torch.std_mean(self.weight, dim=[1, 2, 3], keepdim=True, unbiased=False) - weight = (self.weight - mean) / (std + self.eps) - return weight - - def forward(self, x): - if self.same_pad: - x = pad_same(x, self.kernel_size, self.stride, self.dilation) - x = F.conv2d(x, self.get_weight(), self.bias, self.stride, self.padding, self.dilation, self.groups) - return x - - -class ScaledStdConv2d(nn.Conv2d): - """Conv2d layer with Scaled Weight Standardization. - - Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets` - - https://arxiv.org/abs/2101.08692 - - NOTE: the operations used in this impl differ slightly from the DeepMind Haiku impl. The impact is minor. 
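# ---------------------------------------------------------------------------
# Standalone check of the weight standardization used by StdConv2d above:
# each output filter is normalized to ~zero mean and ~unit (biased) std over
# its (in, kh, kw) elements before being passed to F.conv2d.
import torch

w = torch.randn(16, 8, 3, 3)
std, mean = torch.std_mean(w, dim=[1, 2, 3], keepdim=True, unbiased=False)
w_hat = (w - mean) / (std + 1e-5)
assert torch.allclose(w_hat.mean(dim=[1, 2, 3]), torch.zeros(16), atol=1e-5)
assert torch.allclose(w_hat.std(dim=[1, 2, 3], unbiased=False), torch.ones(16), atol=1e-3)
# ---------------------------------------------------------------------------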
- """ - - def __init__( - self, in_channels, out_channels, kernel_size, stride=1, padding=None, dilation=1, groups=1, - bias=True, gamma=1.0, eps=1e-5, gain_init=1.0, use_layernorm=False): - if padding is None: - padding = get_padding(kernel_size, stride, dilation) - super().__init__( - in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, - groups=groups, bias=bias) - self.gain = nn.Parameter(torch.full((self.out_channels, 1, 1, 1), gain_init)) - self.scale = gamma * self.weight[0].numel() ** -0.5 # gamma * 1 / sqrt(fan-in) - self.eps = eps ** 2 if use_layernorm else eps - self.use_layernorm = use_layernorm # experimental, slightly faster/less GPU memory to hijack LN kernel - - def get_weight(self): - if self.use_layernorm: - weight = self.scale * F.layer_norm(self.weight, self.weight.shape[1:], eps=self.eps) - else: - std, mean = torch.std_mean(self.weight, dim=[1, 2, 3], keepdim=True, unbiased=False) - weight = self.scale * (self.weight - mean) / (std + self.eps) - return self.gain * weight - - def forward(self, x): - return F.conv2d(x, self.get_weight(), self.bias, self.stride, self.padding, self.dilation, self.groups) - - -class ScaledStdConv2dSame(nn.Conv2d): - """Conv2d layer with Scaled Weight Standardization and Tensorflow-like SAME padding support - - Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets` - - https://arxiv.org/abs/2101.08692 - - NOTE: the operations used in this impl differ slightly from the DeepMind Haiku impl. The impact is minor. - """ - - def __init__( - self, in_channels, out_channels, kernel_size, stride=1, padding='SAME', dilation=1, groups=1, - bias=True, gamma=1.0, eps=1e-5, gain_init=1.0, use_layernorm=False): - padding, is_dynamic = get_padding_value(padding, kernel_size, stride=stride, dilation=dilation) - super().__init__( - in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, - groups=groups, bias=bias) - self.gain = nn.Parameter(torch.full((self.out_channels, 1, 1, 1), gain_init)) - self.scale = gamma * self.weight[0].numel() ** -0.5 - self.same_pad = is_dynamic - self.eps = eps ** 2 if use_layernorm else eps - self.use_layernorm = use_layernorm # experimental, slightly faster/less GPU memory to hijack LN kernel - - # NOTE an alternate formulation to consider, closer to DeepMind Haiku impl but doesn't seem - # to make much numerical difference (+/- .002 to .004) in top-1 during eval. 
- # def get_weight(self): - # var, mean = torch.var_mean(self.weight, dim=[1, 2, 3], keepdim=True, unbiased=False) - # scale = torch.rsqrt((self.weight[0].numel() * var).clamp_(self.eps)) * self.gain - # weight = (self.weight - mean) * scale - # return self.gain * weight - - def get_weight(self): - if self.use_layernorm: - weight = self.scale * F.layer_norm(self.weight, self.weight.shape[1:], eps=self.eps) - else: - std, mean = torch.std_mean(self.weight, dim=[1, 2, 3], keepdim=True, unbiased=False) - weight = self.scale * (self.weight - mean) / (std + self.eps) - return self.gain * weight - - def forward(self, x): - if self.same_pad: - x = pad_same(x, self.kernel_size, self.stride, self.dilation) - return F.conv2d(x, self.get_weight(), self.bias, self.stride, self.padding, self.dilation, self.groups) diff --git a/AVLFormer/src/timm/models/layers/test_time_pool.py b/AVLFormer/src/timm/models/layers/test_time_pool.py deleted file mode 100644 index 98c0bf5..0000000 --- a/AVLFormer/src/timm/models/layers/test_time_pool.py +++ /dev/null @@ -1,52 +0,0 @@ -""" Test Time Pooling (Average-Max Pool) - -Hacked together by / Copyright 2020 Ross Wightman -""" - -import logging -from torch import nn -import torch.nn.functional as F - -from .adaptive_avgmax_pool import adaptive_avgmax_pool2d - - -_logger = logging.getLogger(__name__) - - -class TestTimePoolHead(nn.Module): - def __init__(self, base, original_pool=7): - super(TestTimePoolHead, self).__init__() - self.base = base - self.original_pool = original_pool - base_fc = self.base.get_classifier() - if isinstance(base_fc, nn.Conv2d): - self.fc = base_fc - else: - self.fc = nn.Conv2d( - self.base.num_features, self.base.num_classes, kernel_size=1, bias=True) - self.fc.weight.data.copy_(base_fc.weight.data.view(self.fc.weight.size())) - self.fc.bias.data.copy_(base_fc.bias.data.view(self.fc.bias.size())) - self.base.reset_classifier(0) # delete original fc layer - - def forward(self, x): - x = self.base.forward_features(x) - x = F.avg_pool2d(x, kernel_size=self.original_pool, stride=1) - x = self.fc(x) - x = adaptive_avgmax_pool2d(x, 1) - return x.view(x.size(0), -1) - - -def apply_test_time_pool(model, config, use_test_size=True): - test_time_pool = False - if not hasattr(model, 'default_cfg') or not model.default_cfg: - return model, False - if use_test_size and 'test_input_size' in model.default_cfg: - df_input_size = model.default_cfg['test_input_size'] - else: - df_input_size = model.default_cfg['input_size'] - if config['input_size'][-1] > df_input_size[-1] and config['input_size'][-2] > df_input_size[-2]: - _logger.info('Target input size %s > pretrained default %s, using test time pooling' % - (str(config['input_size'][-2:]), str(df_input_size[-2:]))) - model = TestTimePoolHead(model, original_pool=model.default_cfg['pool_size']) - test_time_pool = True - return model, test_time_pool diff --git a/AVLFormer/src/timm/models/layers/weight_init.py b/AVLFormer/src/timm/models/layers/weight_init.py deleted file mode 100644 index 305a2fd..0000000 --- a/AVLFormer/src/timm/models/layers/weight_init.py +++ /dev/null @@ -1,89 +0,0 @@ -import torch -import math -import warnings - -from torch.nn.init import _calculate_fan_in_and_fan_out - - -def _no_grad_trunc_normal_(tensor, mean, std, a, b): - # Cut & paste from PyTorch official master until it's in a few official releases - RW - # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf - def norm_cdf(x): - # Computes standard normal cumulative distribution function - 
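# ---------------------------------------------------------------------------
# Shape walk-through of TestTimePoolHead above, with hypothetical sizes: a
# model trained at 224 (7x7 pooled map) evaluated at 384 yields 12x12
# features; avg_pool2d with the original 7x7 window at stride 1 gives a 6x6
# grid of overlapping "crops", each scored by the 1x1-conv classifier, and
# adaptive_avgmax_pool2d then averages the avg- and max-pooled score maps.
import torch
import torch.nn.functional as F

feats = torch.randn(1, 2048, 12, 12)                 # backbone features at test size
pooled = F.avg_pool2d(feats, kernel_size=7, stride=1)
assert pooled.shape[-2:] == (6, 6)
fc = torch.nn.Conv2d(2048, 1000, kernel_size=1)      # classifier as 1x1 conv
score_map = fc(pooled)                               # (1, 1000, 6, 6)
scores = 0.5 * (F.adaptive_avg_pool2d(score_map, 1) + F.adaptive_max_pool2d(score_map, 1))
assert scores.flatten(1).shape == (1, 1000)
# ---------------------------------------------------------------------------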
return (1. + math.erf(x / math.sqrt(2.))) / 2. - - if (mean < a - 2 * std) or (mean > b + 2 * std): - warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " - "The distribution of values may be incorrect.", - stacklevel=2) - - with torch.no_grad(): - # Values are generated by using a truncated uniform distribution and - # then using the inverse CDF for the normal distribution. - # Get upper and lower cdf values - l = norm_cdf((a - mean) / std) - u = norm_cdf((b - mean) / std) - - # Uniformly fill tensor with values from [l, u], then translate to - # [2l-1, 2u-1]. - tensor.uniform_(2 * l - 1, 2 * u - 1) - - # Use inverse cdf transform for normal distribution to get truncated - # standard normal - tensor.erfinv_() - - # Transform to proper mean, std - tensor.mul_(std * math.sqrt(2.)) - tensor.add_(mean) - - # Clamp to ensure it's in the proper range - tensor.clamp_(min=a, max=b) - return tensor - - -def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): - # type: (Tensor, float, float, float, float) -> Tensor - r"""Fills the input Tensor with values drawn from a truncated - normal distribution. The values are effectively drawn from the - normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` - with values outside :math:`[a, b]` redrawn until they are within - the bounds. The method used for generating the random values works - best when :math:`a \leq \text{mean} \leq b`. - Args: - tensor: an n-dimensional `torch.Tensor` - mean: the mean of the normal distribution - std: the standard deviation of the normal distribution - a: the minimum cutoff value - b: the maximum cutoff value - Examples: - >>> w = torch.empty(3, 5) - >>> nn.init.trunc_normal_(w) - """ - return _no_grad_trunc_normal_(tensor, mean, std, a, b) - - -def variance_scaling_(tensor, scale=1.0, mode='fan_in', distribution='normal'): - fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) - if mode == 'fan_in': - denom = fan_in - elif mode == 'fan_out': - denom = fan_out - elif mode == 'fan_avg': - denom = (fan_in + fan_out) / 2 - - variance = scale / denom - - if distribution == "truncated_normal": - # constant is stddev of standard normal truncated to (-2, 2) - trunc_normal_(tensor, std=math.sqrt(variance) / .87962566103423978) - elif distribution == "normal": - tensor.normal_(std=math.sqrt(variance)) - elif distribution == "uniform": - bound = math.sqrt(3 * variance) - tensor.uniform_(-bound, bound) - else: - raise ValueError(f"invalid distribution {distribution}") - - -def lecun_normal_(tensor): - variance_scaling_(tensor, mode='fan_in', distribution='truncated_normal') diff --git a/AVLFormer/src/timm/models/mobilenetv3.py b/AVLFormer/src/timm/models/mobilenetv3.py deleted file mode 100644 index d565f50..0000000 --- a/AVLFormer/src/timm/models/mobilenetv3.py +++ /dev/null @@ -1,445 +0,0 @@ - -""" MobileNet V3 - -A PyTorch impl of MobileNet-V3, compatible with TF weights from official impl. 
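# ---------------------------------------------------------------------------
# Quick sanity check for trunc_normal_ above (assumes it is in scope): all
# samples land inside [a, b], and the empirical std comes out a bit below the
# requested value because of the truncation (cf. the 0.8796... constant used
# by variance_scaling_ above).
import torch

w = torch.empty(10000)
trunc_normal_(w, mean=0., std=1., a=-2., b=2.)
assert float(w.min()) >= -2. and float(w.max()) <= 2.
print(float(w.mean()), float(w.std()))   # ~0.0 and ~0.88
# ---------------------------------------------------------------------------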
- -Paper: Searching for MobileNetV3 - https://arxiv.org/abs/1905.02244 - -Hacked together by / Copyright 2020 Ross Wightman -""" -import torch -import torch.nn as nn -import torch.nn.functional as F - -from typing import List - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD -from .efficientnet_blocks import round_channels, resolve_bn_args, resolve_act_layer, BN_EPS_TF_DEFAULT -from .efficientnet_builder import EfficientNetBuilder, decode_arch_def, efficientnet_init_weights -from .features import FeatureInfo, FeatureHooks -from .helpers import build_model_with_cfg, default_cfg_for_features -from .layers import SelectAdaptivePool2d, Linear, create_conv2d, get_act_fn, hard_sigmoid -from .registry import register_model - -__all__ = ['MobileNetV3'] - - -def _cfg(url='', **kwargs): - return { - 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (1, 1), - 'crop_pct': 0.875, 'interpolation': 'bilinear', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'conv_stem', 'classifier': 'classifier', - **kwargs - } - - -default_cfgs = { - 'mobilenetv3_large_075': _cfg(url=''), - 'mobilenetv3_large_100': _cfg( - interpolation='bicubic', - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_large_100_ra-f55367f5.pth'), - 'mobilenetv3_small_075': _cfg(url=''), - 'mobilenetv3_small_100': _cfg(url=''), - 'mobilenetv3_rw': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_100-35495452.pth', - interpolation='bicubic'), - 'tf_mobilenetv3_large_075': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_075-150ee8b0.pth', - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), - 'tf_mobilenetv3_large_100': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_100-427764d5.pth', - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), - 'tf_mobilenetv3_large_minimal_100': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_minimal_100-8596ae28.pth', - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), - 'tf_mobilenetv3_small_075': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_075-da427f52.pth', - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), - 'tf_mobilenetv3_small_100': _cfg( - url= 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_100-37f49e2b.pth', - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), - 'tf_mobilenetv3_small_minimal_100': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_minimal_100-922a7843.pth', - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), -} - -_DEBUG = False - - -class MobileNetV3(nn.Module): - """ MobiletNet-V3 - - Based on my EfficientNet implementation and building blocks, this model utilizes the MobileNet-v3 specific - 'efficient head', where global pooling is done before the head convolution without a final batch-norm - layer before the classifier. 
- - Paper: https://arxiv.org/abs/1905.02244 - """ - - def __init__(self, block_args, num_classes=1000, in_chans=3, stem_size=16, num_features=1280, head_bias=True, - channel_multiplier=1.0, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0., - se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, global_pool='avg'): - super(MobileNetV3, self).__init__() - - self.num_classes = num_classes - self.num_features = num_features - self.drop_rate = drop_rate - - # Stem - stem_size = round_channels(stem_size, channel_multiplier) - self.conv_stem = create_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type) - self.bn1 = norm_layer(stem_size, **norm_kwargs) - self.act1 = act_layer(inplace=True) - - # Middle stages (IR/ER/DS Blocks) - builder = EfficientNetBuilder( - channel_multiplier, 8, None, 32, pad_type, act_layer, se_kwargs, - norm_layer, norm_kwargs, drop_path_rate, verbose=_DEBUG) - self.blocks = nn.Sequential(*builder(stem_size, block_args)) - self.feature_info = builder.features - head_chs = builder.in_chs - - # Head + Pooling - self.global_pool = SelectAdaptivePool2d(pool_type=global_pool) - num_pooled_chs = head_chs * self.global_pool.feat_mult() - self.conv_head = create_conv2d(num_pooled_chs, self.num_features, 1, padding=pad_type, bias=head_bias) - self.act2 = act_layer(inplace=True) - self.classifier = Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() - - efficientnet_init_weights(self) - - def as_sequential(self): - layers = [self.conv_stem, self.bn1, self.act1] - layers.extend(self.blocks) - layers.extend([self.global_pool, self.conv_head, self.act2]) - layers.extend([nn.Flatten(), nn.Dropout(self.drop_rate), self.classifier]) - return nn.Sequential(*layers) - - def get_classifier(self): - return self.classifier - - def reset_classifier(self, num_classes, global_pool='avg'): - self.num_classes = num_classes - # cannot meaningfully change pooling of efficient head after creation - self.global_pool = SelectAdaptivePool2d(pool_type=global_pool) - self.classifier = Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() - - def forward_features(self, x): - x = self.conv_stem(x) - x = self.bn1(x) - x = self.act1(x) - x = self.blocks(x) - x = self.global_pool(x) - x = self.conv_head(x) - x = self.act2(x) - return x - - def forward(self, x): - x = self.forward_features(x) - if not self.global_pool.is_identity(): - x = x.flatten(1) - if self.drop_rate > 0.: - x = F.dropout(x, p=self.drop_rate, training=self.training) - return self.classifier(x) - - -class MobileNetV3Features(nn.Module): - """ MobileNetV3 Feature Extractor - - A work-in-progress feature extraction module for MobileNet-V3 to use as a backbone for segmentation - and object detection models. 
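# ---------------------------------------------------------------------------
# Hedged usage sketch for the MobileNetV3 classifier above, assuming this
# vendored module imports as src.timm.models.mobilenetv3. Because of the
# 'efficient head', forward_features already pools globally before the 1x1
# conv_head, so it returns a (B, num_features, 1, 1) tensor.
import torch
from src.timm.models.mobilenetv3 import mobilenetv3_large_100

model = mobilenetv3_large_100(pretrained=False, num_classes=10)
x = torch.randn(2, 3, 224, 224)
assert model.forward_features(x).shape == (2, 1280, 1, 1)
assert model(x).shape == (2, 10)
# ---------------------------------------------------------------------------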
- """ - - def __init__(self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='bottleneck', - in_chans=3, stem_size=16, channel_multiplier=1.0, output_stride=32, pad_type='', - act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0., se_kwargs=None, - norm_layer=nn.BatchNorm2d, norm_kwargs=None): - super(MobileNetV3Features, self).__init__() - norm_kwargs = norm_kwargs or {} - self.drop_rate = drop_rate - - # Stem - stem_size = round_channels(stem_size, channel_multiplier) - self.conv_stem = create_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type) - self.bn1 = norm_layer(stem_size, **norm_kwargs) - self.act1 = act_layer(inplace=True) - - # Middle stages (IR/ER/DS Blocks) - builder = EfficientNetBuilder( - channel_multiplier, 8, None, output_stride, pad_type, act_layer, se_kwargs, - norm_layer, norm_kwargs, drop_path_rate, feature_location=feature_location, verbose=_DEBUG) - self.blocks = nn.Sequential(*builder(stem_size, block_args)) - self.feature_info = FeatureInfo(builder.features, out_indices) - self._stage_out_idx = {v['stage']: i for i, v in enumerate(self.feature_info) if i in out_indices} - - efficientnet_init_weights(self) - - # Register feature extraction hooks with FeatureHooks helper - self.feature_hooks = None - if feature_location != 'bottleneck': - hooks = self.feature_info.get_dicts(keys=('module', 'hook_type')) - self.feature_hooks = FeatureHooks(hooks, self.named_modules()) - - def forward(self, x) -> List[torch.Tensor]: - x = self.conv_stem(x) - x = self.bn1(x) - x = self.act1(x) - if self.feature_hooks is None: - features = [] - if 0 in self._stage_out_idx: - features.append(x) # add stem out - for i, b in enumerate(self.blocks): - x = b(x) - if i + 1 in self._stage_out_idx: - features.append(x) - return features - else: - self.blocks(x) - out = self.feature_hooks.get_output(x.device) - return list(out.values()) - - -def _create_mnv3(variant, pretrained=False, **kwargs): - features_only = False - model_cls = MobileNetV3 - kwargs_filter = None - if kwargs.pop('features_only', False): - features_only = True - kwargs_filter = ('num_classes', 'num_features', 'head_conv', 'head_bias', 'global_pool') - model_cls = MobileNetV3Features - model = build_model_with_cfg( - model_cls, variant, pretrained, - default_cfg=default_cfgs[variant], - pretrained_strict=not features_only, - kwargs_filter=kwargs_filter, - **kwargs) - if features_only: - model.default_cfg = default_cfg_for_features(model.default_cfg) - return model - - -def _gen_mobilenet_v3_rw(variant, channel_multiplier=1.0, pretrained=False, **kwargs): - """Creates a MobileNet-V3 model. - - Ref impl: ? - Paper: https://arxiv.org/abs/1905.02244 - - Args: - channel_multiplier: multiplier to number of channels per layer. 
- """ - arch_def = [ - # stage 0, 112x112 in - ['ds_r1_k3_s1_e1_c16_nre_noskip'], # relu - # stage 1, 112x112 in - ['ir_r1_k3_s2_e4_c24_nre', 'ir_r1_k3_s1_e3_c24_nre'], # relu - # stage 2, 56x56 in - ['ir_r3_k5_s2_e3_c40_se0.25_nre'], # relu - # stage 3, 28x28 in - ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'], # hard-swish - # stage 4, 14x14in - ['ir_r2_k3_s1_e6_c112_se0.25'], # hard-swish - # stage 5, 14x14in - ['ir_r3_k5_s2_e6_c160_se0.25'], # hard-swish - # stage 6, 7x7 in - ['cn_r1_k1_s1_c960'], # hard-swish - ] - model_kwargs = dict( - block_args=decode_arch_def(arch_def), - head_bias=False, - channel_multiplier=channel_multiplier, - norm_kwargs=resolve_bn_args(kwargs), - act_layer=resolve_act_layer(kwargs, 'hard_swish'), - se_kwargs=dict(gate_fn=get_act_fn('hard_sigmoid'), reduce_mid=True, divisor=1), - **kwargs, - ) - model = _create_mnv3(variant, pretrained, **model_kwargs) - return model - - -def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwargs): - """Creates a MobileNet-V3 model. - - Ref impl: ? - Paper: https://arxiv.org/abs/1905.02244 - - Args: - channel_multiplier: multiplier to number of channels per layer. - """ - if 'small' in variant: - num_features = 1024 - if 'minimal' in variant: - act_layer = resolve_act_layer(kwargs, 'relu') - arch_def = [ - # stage 0, 112x112 in - ['ds_r1_k3_s2_e1_c16'], - # stage 1, 56x56 in - ['ir_r1_k3_s2_e4.5_c24', 'ir_r1_k3_s1_e3.67_c24'], - # stage 2, 28x28 in - ['ir_r1_k3_s2_e4_c40', 'ir_r2_k3_s1_e6_c40'], - # stage 3, 14x14 in - ['ir_r2_k3_s1_e3_c48'], - # stage 4, 14x14in - ['ir_r3_k3_s2_e6_c96'], - # stage 6, 7x7 in - ['cn_r1_k1_s1_c576'], - ] - else: - act_layer = resolve_act_layer(kwargs, 'hard_swish') - arch_def = [ - # stage 0, 112x112 in - ['ds_r1_k3_s2_e1_c16_se0.25_nre'], # relu - # stage 1, 56x56 in - ['ir_r1_k3_s2_e4.5_c24_nre', 'ir_r1_k3_s1_e3.67_c24_nre'], # relu - # stage 2, 28x28 in - ['ir_r1_k5_s2_e4_c40_se0.25', 'ir_r2_k5_s1_e6_c40_se0.25'], # hard-swish - # stage 3, 14x14 in - ['ir_r2_k5_s1_e3_c48_se0.25'], # hard-swish - # stage 4, 14x14in - ['ir_r3_k5_s2_e6_c96_se0.25'], # hard-swish - # stage 6, 7x7 in - ['cn_r1_k1_s1_c576'], # hard-swish - ] - else: - num_features = 1280 - if 'minimal' in variant: - act_layer = resolve_act_layer(kwargs, 'relu') - arch_def = [ - # stage 0, 112x112 in - ['ds_r1_k3_s1_e1_c16'], - # stage 1, 112x112 in - ['ir_r1_k3_s2_e4_c24', 'ir_r1_k3_s1_e3_c24'], - # stage 2, 56x56 in - ['ir_r3_k3_s2_e3_c40'], - # stage 3, 28x28 in - ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'], - # stage 4, 14x14in - ['ir_r2_k3_s1_e6_c112'], - # stage 5, 14x14in - ['ir_r3_k3_s2_e6_c160'], - # stage 6, 7x7 in - ['cn_r1_k1_s1_c960'], - ] - else: - act_layer = resolve_act_layer(kwargs, 'hard_swish') - arch_def = [ - # stage 0, 112x112 in - ['ds_r1_k3_s1_e1_c16_nre'], # relu - # stage 1, 112x112 in - ['ir_r1_k3_s2_e4_c24_nre', 'ir_r1_k3_s1_e3_c24_nre'], # relu - # stage 2, 56x56 in - ['ir_r3_k5_s2_e3_c40_se0.25_nre'], # relu - # stage 3, 28x28 in - ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'], # hard-swish - # stage 4, 14x14in - ['ir_r2_k3_s1_e6_c112_se0.25'], # hard-swish - # stage 5, 14x14in - ['ir_r3_k5_s2_e6_c160_se0.25'], # hard-swish - # stage 6, 7x7 in - ['cn_r1_k1_s1_c960'], # hard-swish - ] - - model_kwargs = dict( - block_args=decode_arch_def(arch_def), - num_features=num_features, - stem_size=16, - channel_multiplier=channel_multiplier, - norm_kwargs=resolve_bn_args(kwargs), - act_layer=act_layer, - 
se_kwargs=dict(act_layer=nn.ReLU, gate_fn=hard_sigmoid, reduce_mid=True, divisor=8), - **kwargs, - ) - model = _create_mnv3(variant, pretrained, **model_kwargs) - return model - - -@register_model -def mobilenetv3_large_075(pretrained=False, **kwargs): - """ MobileNet V3 """ - model = _gen_mobilenet_v3('mobilenetv3_large_075', 0.75, pretrained=pretrained, **kwargs) - return model - - -@register_model -def mobilenetv3_large_100(pretrained=False, **kwargs): - """ MobileNet V3 """ - model = _gen_mobilenet_v3('mobilenetv3_large_100', 1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def mobilenetv3_small_075(pretrained=False, **kwargs): - """ MobileNet V3 """ - model = _gen_mobilenet_v3('mobilenetv3_small_075', 0.75, pretrained=pretrained, **kwargs) - return model - - -@register_model -def mobilenetv3_small_100(pretrained=False, **kwargs): - """ MobileNet V3 """ - model = _gen_mobilenet_v3('mobilenetv3_small_100', 1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def mobilenetv3_rw(pretrained=False, **kwargs): - """ MobileNet V3 """ - if pretrained: - # pretrained model trained with non-default BN epsilon - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - model = _gen_mobilenet_v3_rw('mobilenetv3_rw', 1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_mobilenetv3_large_075(pretrained=False, **kwargs): - """ MobileNet V3 """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_mobilenet_v3('tf_mobilenetv3_large_075', 0.75, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_mobilenetv3_large_100(pretrained=False, **kwargs): - """ MobileNet V3 """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_mobilenet_v3('tf_mobilenetv3_large_100', 1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_mobilenetv3_large_minimal_100(pretrained=False, **kwargs): - """ MobileNet V3 """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_mobilenet_v3('tf_mobilenetv3_large_minimal_100', 1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_mobilenetv3_small_075(pretrained=False, **kwargs): - """ MobileNet V3 """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_mobilenet_v3('tf_mobilenetv3_small_075', 0.75, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_mobilenetv3_small_100(pretrained=False, **kwargs): - """ MobileNet V3 """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_mobilenet_v3('tf_mobilenetv3_small_100', 1.0, pretrained=pretrained, **kwargs) - return model - - -@register_model -def tf_mobilenetv3_small_minimal_100(pretrained=False, **kwargs): - """ MobileNet V3 """ - kwargs['bn_eps'] = BN_EPS_TF_DEFAULT - kwargs['pad_type'] = 'same' - model = _gen_mobilenet_v3('tf_mobilenetv3_small_minimal_100', 1.0, pretrained=pretrained, **kwargs) - return model diff --git a/AVLFormer/src/timm/models/nasnet.py b/AVLFormer/src/timm/models/nasnet.py deleted file mode 100644 index 2afe82c..0000000 --- a/AVLFormer/src/timm/models/nasnet.py +++ /dev/null @@ -1,567 +0,0 @@ -""" NasNet-A (Large) - nasnetalarge implementation grabbed from Cadene's pretrained models - https://github.com/Cadene/pretrained-models.pytorch -""" -from functools import partial - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from .helpers import build_model_with_cfg -from .layers 
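# ---------------------------------------------------------------------------
# The tf_ variants registered above differ from the defaults only in
# TF-compatibility knobs: bn_eps=BN_EPS_TF_DEFAULT (1e-3 in the vendored
# efficientnet_blocks) and 'same' padding, so weights ported from the official
# TF graphs line up. Hedged check, assuming the vendored import path:
from src.timm.models.mobilenetv3 import tf_mobilenetv3_small_100

m = tf_mobilenetv3_small_100(pretrained=False)
print(m.bn1.eps)   # 0.001 rather than PyTorch's default 1e-5
# ---------------------------------------------------------------------------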
import ConvBnAct, create_conv2d, create_pool2d, create_classifier -from .registry import register_model - -__all__ = ['NASNetALarge'] - -default_cfgs = { - 'nasnetalarge': { - 'url': 'http://data.lip6.fr/cadene/pretrainedmodels/nasnetalarge-a1897284.pth', - 'input_size': (3, 331, 331), - 'pool_size': (11, 11), - 'crop_pct': 0.911, - 'interpolation': 'bicubic', - 'mean': (0.5, 0.5, 0.5), - 'std': (0.5, 0.5, 0.5), - 'num_classes': 1000, - 'first_conv': 'conv0.conv', - 'classifier': 'last_linear', - 'label_offset': 1, # 1001 classes in pretrained weights - }, -} - - -class ActConvBn(nn.Module): - - def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=''): - super(ActConvBn, self).__init__() - self.act = nn.ReLU() - self.conv = create_conv2d( - in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding) - self.bn = nn.BatchNorm2d(out_channels, eps=0.001, momentum=0.1) - - def forward(self, x): - x = self.act(x) - x = self.conv(x) - x = self.bn(x) - return x - - -class SeparableConv2d(nn.Module): - - def __init__(self, in_channels, out_channels, kernel_size, stride, padding=''): - super(SeparableConv2d, self).__init__() - self.depthwise_conv2d = create_conv2d( - in_channels, in_channels, kernel_size=kernel_size, - stride=stride, padding=padding, groups=in_channels) - self.pointwise_conv2d = create_conv2d( - in_channels, out_channels, kernel_size=1, padding=0) - - def forward(self, x): - x = self.depthwise_conv2d(x) - x = self.pointwise_conv2d(x) - return x - - -class BranchSeparables(nn.Module): - - def __init__(self, in_channels, out_channels, kernel_size, stride=1, pad_type='', stem_cell=False): - super(BranchSeparables, self).__init__() - middle_channels = out_channels if stem_cell else in_channels - self.act_1 = nn.ReLU() - self.separable_1 = SeparableConv2d( - in_channels, middle_channels, kernel_size, stride=stride, padding=pad_type) - self.bn_sep_1 = nn.BatchNorm2d(middle_channels, eps=0.001, momentum=0.1) - self.act_2 = nn.ReLU(inplace=True) - self.separable_2 = SeparableConv2d( - middle_channels, out_channels, kernel_size, stride=1, padding=pad_type) - self.bn_sep_2 = nn.BatchNorm2d(out_channels, eps=0.001, momentum=0.1) - - def forward(self, x): - x = self.act_1(x) - x = self.separable_1(x) - x = self.bn_sep_1(x) - x = self.act_2(x) - x = self.separable_2(x) - x = self.bn_sep_2(x) - return x - - -class CellStem0(nn.Module): - def __init__(self, stem_size, num_channels=42, pad_type=''): - super(CellStem0, self).__init__() - self.num_channels = num_channels - self.stem_size = stem_size - self.conv_1x1 = ActConvBn(self.stem_size, self.num_channels, 1, stride=1) - - self.comb_iter_0_left = BranchSeparables(self.num_channels, self.num_channels, 5, 2, pad_type) - self.comb_iter_0_right = BranchSeparables(self.stem_size, self.num_channels, 7, 2, pad_type, stem_cell=True) - - self.comb_iter_1_left = create_pool2d('max', 3, 2, padding=pad_type) - self.comb_iter_1_right = BranchSeparables(self.stem_size, self.num_channels, 7, 2, pad_type, stem_cell=True) - - self.comb_iter_2_left = create_pool2d('avg', 3, 2, count_include_pad=False, padding=pad_type) - self.comb_iter_2_right = BranchSeparables(self.stem_size, self.num_channels, 5, 2, pad_type, stem_cell=True) - - self.comb_iter_3_right = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type) - - self.comb_iter_4_left = BranchSeparables(self.num_channels, self.num_channels, 3, 1, pad_type) - self.comb_iter_4_right = create_pool2d('max', 3, 2, padding=pad_type) - - def forward(self, 
x): - x1 = self.conv_1x1(x) - - x_comb_iter_0_left = self.comb_iter_0_left(x1) - x_comb_iter_0_right = self.comb_iter_0_right(x) - x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right - - x_comb_iter_1_left = self.comb_iter_1_left(x1) - x_comb_iter_1_right = self.comb_iter_1_right(x) - x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right - - x_comb_iter_2_left = self.comb_iter_2_left(x1) - x_comb_iter_2_right = self.comb_iter_2_right(x) - x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right - - x_comb_iter_3_right = self.comb_iter_3_right(x_comb_iter_0) - x_comb_iter_3 = x_comb_iter_3_right + x_comb_iter_1 - - x_comb_iter_4_left = self.comb_iter_4_left(x_comb_iter_0) - x_comb_iter_4_right = self.comb_iter_4_right(x1) - x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right - - x_out = torch.cat([x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1) - return x_out - - -class CellStem1(nn.Module): - - def __init__(self, stem_size, num_channels, pad_type=''): - super(CellStem1, self).__init__() - self.num_channels = num_channels - self.stem_size = stem_size - self.conv_1x1 = ActConvBn(2 * self.num_channels, self.num_channels, 1, stride=1) - - self.act = nn.ReLU() - self.path_1 = nn.Sequential() - self.path_1.add_module('avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False)) - self.path_1.add_module('conv', nn.Conv2d(self.stem_size, self.num_channels // 2, 1, stride=1, bias=False)) - - self.path_2 = nn.Sequential() - self.path_2.add_module('pad', nn.ZeroPad2d((-1, 1, -1, 1))) - self.path_2.add_module('avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False)) - self.path_2.add_module('conv', nn.Conv2d(self.stem_size, self.num_channels // 2, 1, stride=1, bias=False)) - - self.final_path_bn = nn.BatchNorm2d(self.num_channels, eps=0.001, momentum=0.1) - - self.comb_iter_0_left = BranchSeparables(self.num_channels, self.num_channels, 5, 2, pad_type) - self.comb_iter_0_right = BranchSeparables(self.num_channels, self.num_channels, 7, 2, pad_type) - - self.comb_iter_1_left = create_pool2d('max', 3, 2, padding=pad_type) - self.comb_iter_1_right = BranchSeparables(self.num_channels, self.num_channels, 7, 2, pad_type) - - self.comb_iter_2_left = create_pool2d('avg', 3, 2, count_include_pad=False, padding=pad_type) - self.comb_iter_2_right = BranchSeparables(self.num_channels, self.num_channels, 5, 2, pad_type) - - self.comb_iter_3_right = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type) - - self.comb_iter_4_left = BranchSeparables(self.num_channels, self.num_channels, 3, 1, pad_type) - self.comb_iter_4_right = create_pool2d('max', 3, 2, padding=pad_type) - - def forward(self, x_conv0, x_stem_0): - x_left = self.conv_1x1(x_stem_0) - - x_relu = self.act(x_conv0) - # path 1 - x_path1 = self.path_1(x_relu) - # path 2 - x_path2 = self.path_2(x_relu) - # final path - x_right = self.final_path_bn(torch.cat([x_path1, x_path2], 1)) - - x_comb_iter_0_left = self.comb_iter_0_left(x_left) - x_comb_iter_0_right = self.comb_iter_0_right(x_right) - x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right - - x_comb_iter_1_left = self.comb_iter_1_left(x_left) - x_comb_iter_1_right = self.comb_iter_1_right(x_right) - x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right - - x_comb_iter_2_left = self.comb_iter_2_left(x_left) - x_comb_iter_2_right = self.comb_iter_2_right(x_right) - x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right - - x_comb_iter_3_right = self.comb_iter_3_right(x_comb_iter_0) - x_comb_iter_3 = x_comb_iter_3_right + x_comb_iter_1 - - 
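# ---------------------------------------------------------------------------
# Shape sketch for CellStem0 above (assumes the class is in scope). The cell
# concatenates four branch sums, so out_channels = 4 * num_channels, with
# every kept branch at stride 2. Sizes mirror a 331x331 input after the
# stride-2, pad-0 conv0 (331 -> 165).
import torch

cell = CellStem0(stem_size=96, num_channels=42, pad_type='same')
y = cell(torch.randn(1, 96, 165, 165))
assert tuple(y.shape) == (1, 168, 83, 83)
# ---------------------------------------------------------------------------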
x_comb_iter_4_left = self.comb_iter_4_left(x_comb_iter_0) - x_comb_iter_4_right = self.comb_iter_4_right(x_left) - x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right - - x_out = torch.cat([x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1) - return x_out - - -class FirstCell(nn.Module): - - def __init__(self, in_chs_left, out_chs_left, in_chs_right, out_chs_right, pad_type=''): - super(FirstCell, self).__init__() - self.conv_1x1 = ActConvBn(in_chs_right, out_chs_right, 1, stride=1) - - self.act = nn.ReLU() - self.path_1 = nn.Sequential() - self.path_1.add_module('avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False)) - self.path_1.add_module('conv', nn.Conv2d(in_chs_left, out_chs_left, 1, stride=1, bias=False)) - - self.path_2 = nn.Sequential() - self.path_2.add_module('pad', nn.ZeroPad2d((-1, 1, -1, 1))) - self.path_2.add_module('avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False)) - self.path_2.add_module('conv', nn.Conv2d(in_chs_left, out_chs_left, 1, stride=1, bias=False)) - - self.final_path_bn = nn.BatchNorm2d(out_chs_left * 2, eps=0.001, momentum=0.1) - - self.comb_iter_0_left = BranchSeparables(out_chs_right, out_chs_right, 5, 1, pad_type) - self.comb_iter_0_right = BranchSeparables(out_chs_right, out_chs_right, 3, 1, pad_type) - - self.comb_iter_1_left = BranchSeparables(out_chs_right, out_chs_right, 5, 1, pad_type) - self.comb_iter_1_right = BranchSeparables(out_chs_right, out_chs_right, 3, 1, pad_type) - - self.comb_iter_2_left = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type) - - self.comb_iter_3_left = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type) - self.comb_iter_3_right = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type) - - self.comb_iter_4_left = BranchSeparables(out_chs_right, out_chs_right, 3, 1, pad_type) - - def forward(self, x, x_prev): - x_relu = self.act(x_prev) - x_path1 = self.path_1(x_relu) - x_path2 = self.path_2(x_relu) - x_left = self.final_path_bn(torch.cat([x_path1, x_path2], 1)) - x_right = self.conv_1x1(x) - - x_comb_iter_0_left = self.comb_iter_0_left(x_right) - x_comb_iter_0_right = self.comb_iter_0_right(x_left) - x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right - - x_comb_iter_1_left = self.comb_iter_1_left(x_left) - x_comb_iter_1_right = self.comb_iter_1_right(x_left) - x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right - - x_comb_iter_2_left = self.comb_iter_2_left(x_right) - x_comb_iter_2 = x_comb_iter_2_left + x_left - - x_comb_iter_3_left = self.comb_iter_3_left(x_left) - x_comb_iter_3_right = self.comb_iter_3_right(x_left) - x_comb_iter_3 = x_comb_iter_3_left + x_comb_iter_3_right - - x_comb_iter_4_left = self.comb_iter_4_left(x_right) - x_comb_iter_4 = x_comb_iter_4_left + x_right - - x_out = torch.cat([x_left, x_comb_iter_0, x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1) - return x_out - - -class NormalCell(nn.Module): - - def __init__(self, in_chs_left, out_chs_left, in_chs_right, out_chs_right, pad_type=''): - super(NormalCell, self).__init__() - self.conv_prev_1x1 = ActConvBn(in_chs_left, out_chs_left, 1, stride=1, padding=pad_type) - self.conv_1x1 = ActConvBn(in_chs_right, out_chs_right, 1, stride=1, padding=pad_type) - - self.comb_iter_0_left = BranchSeparables(out_chs_right, out_chs_right, 5, 1, pad_type) - self.comb_iter_0_right = BranchSeparables(out_chs_left, out_chs_left, 3, 1, pad_type) - - self.comb_iter_1_left = BranchSeparables(out_chs_left, out_chs_left, 5, 1, pad_type) - self.comb_iter_1_right = 
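# ---------------------------------------------------------------------------
# The path_2 'pad' used by CellStem1/FirstCell above exploits negative
# padding: ZeroPad2d((-1, 1, -1, 1)) crops the first row/column and zero-pads
# the far side, so the stride-2 1x1 average pool that follows samples the odd
# pixel phase while path_1 samples the even phase; concatenating both paths
# preserves the two phases across the downsample.
import torch
import torch.nn as nn

x = torch.arange(16.).view(1, 1, 4, 4)
shifted = nn.ZeroPad2d((-1, 1, -1, 1))(x)
pooled = nn.AvgPool2d(1, stride=2, count_include_pad=False)(shifted)
print(pooled.squeeze())   # tensor([[ 5.,  7.], [13., 15.]]) == x[..., 1::2, 1::2]
# ---------------------------------------------------------------------------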
BranchSeparables(out_chs_left, out_chs_left, 3, 1, pad_type) - - self.comb_iter_2_left = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type) - - self.comb_iter_3_left = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type) - self.comb_iter_3_right = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type) - - self.comb_iter_4_left = BranchSeparables(out_chs_right, out_chs_right, 3, 1, pad_type) - - def forward(self, x, x_prev): - x_left = self.conv_prev_1x1(x_prev) - x_right = self.conv_1x1(x) - - x_comb_iter_0_left = self.comb_iter_0_left(x_right) - x_comb_iter_0_right = self.comb_iter_0_right(x_left) - x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right - - x_comb_iter_1_left = self.comb_iter_1_left(x_left) - x_comb_iter_1_right = self.comb_iter_1_right(x_left) - x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right - - x_comb_iter_2_left = self.comb_iter_2_left(x_right) - x_comb_iter_2 = x_comb_iter_2_left + x_left - - x_comb_iter_3_left = self.comb_iter_3_left(x_left) - x_comb_iter_3_right = self.comb_iter_3_right(x_left) - x_comb_iter_3 = x_comb_iter_3_left + x_comb_iter_3_right - - x_comb_iter_4_left = self.comb_iter_4_left(x_right) - x_comb_iter_4 = x_comb_iter_4_left + x_right - - x_out = torch.cat([x_left, x_comb_iter_0, x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1) - return x_out - - -class ReductionCell0(nn.Module): - - def __init__(self, in_chs_left, out_chs_left, in_chs_right, out_chs_right, pad_type=''): - super(ReductionCell0, self).__init__() - self.conv_prev_1x1 = ActConvBn(in_chs_left, out_chs_left, 1, stride=1, padding=pad_type) - self.conv_1x1 = ActConvBn(in_chs_right, out_chs_right, 1, stride=1, padding=pad_type) - - self.comb_iter_0_left = BranchSeparables(out_chs_right, out_chs_right, 5, 2, pad_type) - self.comb_iter_0_right = BranchSeparables(out_chs_right, out_chs_right, 7, 2, pad_type) - - self.comb_iter_1_left = create_pool2d('max', 3, 2, padding=pad_type) - self.comb_iter_1_right = BranchSeparables(out_chs_right, out_chs_right, 7, 2, pad_type) - - self.comb_iter_2_left = create_pool2d('avg', 3, 2, count_include_pad=False, padding=pad_type) - self.comb_iter_2_right = BranchSeparables(out_chs_right, out_chs_right, 5, 2, pad_type) - - self.comb_iter_3_right = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type) - - self.comb_iter_4_left = BranchSeparables(out_chs_right, out_chs_right, 3, 1, pad_type) - self.comb_iter_4_right = create_pool2d('max', 3, 2, padding=pad_type) - - def forward(self, x, x_prev): - x_left = self.conv_prev_1x1(x_prev) - x_right = self.conv_1x1(x) - - x_comb_iter_0_left = self.comb_iter_0_left(x_right) - x_comb_iter_0_right = self.comb_iter_0_right(x_left) - x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right - - x_comb_iter_1_left = self.comb_iter_1_left(x_right) - x_comb_iter_1_right = self.comb_iter_1_right(x_left) - x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right - - x_comb_iter_2_left = self.comb_iter_2_left(x_right) - x_comb_iter_2_right = self.comb_iter_2_right(x_left) - x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right - - x_comb_iter_3_right = self.comb_iter_3_right(x_comb_iter_0) - x_comb_iter_3 = x_comb_iter_3_right + x_comb_iter_1 - - x_comb_iter_4_left = self.comb_iter_4_left(x_comb_iter_0) - x_comb_iter_4_right = self.comb_iter_4_right(x_right) - x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right - - x_out = torch.cat([x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1) - return x_out - - -class 
ReductionCell1(nn.Module): - - def __init__(self, in_chs_left, out_chs_left, in_chs_right, out_chs_right, pad_type=''): - super(ReductionCell1, self).__init__() - self.conv_prev_1x1 = ActConvBn(in_chs_left, out_chs_left, 1, stride=1, padding=pad_type) - self.conv_1x1 = ActConvBn(in_chs_right, out_chs_right, 1, stride=1, padding=pad_type) - - self.comb_iter_0_left = BranchSeparables(out_chs_right, out_chs_right, 5, 2, pad_type) - self.comb_iter_0_right = BranchSeparables(out_chs_right, out_chs_right, 7, 2, pad_type) - - self.comb_iter_1_left = create_pool2d('max', 3, 2, padding=pad_type) - self.comb_iter_1_right = BranchSeparables(out_chs_right, out_chs_right, 7, 2, pad_type) - - self.comb_iter_2_left = create_pool2d('avg', 3, 2, count_include_pad=False, padding=pad_type) - self.comb_iter_2_right = BranchSeparables(out_chs_right, out_chs_right, 5, 2, pad_type) - - self.comb_iter_3_right = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type) - - self.comb_iter_4_left = BranchSeparables(out_chs_right, out_chs_right, 3, 1, pad_type) - self.comb_iter_4_right = create_pool2d('max', 3, 2, padding=pad_type) - - def forward(self, x, x_prev): - x_left = self.conv_prev_1x1(x_prev) - x_right = self.conv_1x1(x) - - x_comb_iter_0_left = self.comb_iter_0_left(x_right) - x_comb_iter_0_right = self.comb_iter_0_right(x_left) - x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right - - x_comb_iter_1_left = self.comb_iter_1_left(x_right) - x_comb_iter_1_right = self.comb_iter_1_right(x_left) - x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right - - x_comb_iter_2_left = self.comb_iter_2_left(x_right) - x_comb_iter_2_right = self.comb_iter_2_right(x_left) - x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right - - x_comb_iter_3_right = self.comb_iter_3_right(x_comb_iter_0) - x_comb_iter_3 = x_comb_iter_3_right + x_comb_iter_1 - - x_comb_iter_4_left = self.comb_iter_4_left(x_comb_iter_0) - x_comb_iter_4_right = self.comb_iter_4_right(x_right) - x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right - - x_out = torch.cat([x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1) - return x_out - - -class NASNetALarge(nn.Module): - """NASNetALarge (6 @ 4032) """ - - def __init__(self, num_classes=1000, in_chans=3, stem_size=96, channel_multiplier=2, - num_features=4032, output_stride=32, drop_rate=0., global_pool='avg', pad_type='same'): - super(NASNetALarge, self).__init__() - self.num_classes = num_classes - self.stem_size = stem_size - self.num_features = num_features - self.channel_multiplier = channel_multiplier - self.drop_rate = drop_rate - assert output_stride == 32 - - channels = self.num_features // 24 - # 24 is default value for the architecture - - self.conv0 = ConvBnAct( - in_channels=in_chans, out_channels=self.stem_size, kernel_size=3, padding=0, stride=2, - norm_layer=partial(nn.BatchNorm2d, eps=0.001, momentum=0.1), apply_act=False) - - self.cell_stem_0 = CellStem0( - self.stem_size, num_channels=channels // (channel_multiplier ** 2), pad_type=pad_type) - self.cell_stem_1 = CellStem1( - self.stem_size, num_channels=channels // channel_multiplier, pad_type=pad_type) - - self.cell_0 = FirstCell( - in_chs_left=channels, out_chs_left=channels // 2, - in_chs_right=2 * channels, out_chs_right=channels, pad_type=pad_type) - self.cell_1 = NormalCell( - in_chs_left=2 * channels, out_chs_left=channels, - in_chs_right=6 * channels, out_chs_right=channels, pad_type=pad_type) - self.cell_2 = NormalCell( - in_chs_left=6 * channels, out_chs_left=channels, - in_chs_right=6 * 
channels, out_chs_right=channels, pad_type=pad_type) - self.cell_3 = NormalCell( - in_chs_left=6 * channels, out_chs_left=channels, - in_chs_right=6 * channels, out_chs_right=channels, pad_type=pad_type) - self.cell_4 = NormalCell( - in_chs_left=6 * channels, out_chs_left=channels, - in_chs_right=6 * channels, out_chs_right=channels, pad_type=pad_type) - self.cell_5 = NormalCell( - in_chs_left=6 * channels, out_chs_left=channels, - in_chs_right=6 * channels, out_chs_right=channels, pad_type=pad_type) - - self.reduction_cell_0 = ReductionCell0( - in_chs_left=6 * channels, out_chs_left=2 * channels, - in_chs_right=6 * channels, out_chs_right=2 * channels, pad_type=pad_type) - self.cell_6 = FirstCell( - in_chs_left=6 * channels, out_chs_left=channels, - in_chs_right=8 * channels, out_chs_right=2 * channels, pad_type=pad_type) - self.cell_7 = NormalCell( - in_chs_left=8 * channels, out_chs_left=2 * channels, - in_chs_right=12 * channels, out_chs_right=2 * channels, pad_type=pad_type) - self.cell_8 = NormalCell( - in_chs_left=12 * channels, out_chs_left=2 * channels, - in_chs_right=12 * channels, out_chs_right=2 * channels, pad_type=pad_type) - self.cell_9 = NormalCell( - in_chs_left=12 * channels, out_chs_left=2 * channels, - in_chs_right=12 * channels, out_chs_right=2 * channels, pad_type=pad_type) - self.cell_10 = NormalCell( - in_chs_left=12 * channels, out_chs_left=2 * channels, - in_chs_right=12 * channels, out_chs_right=2 * channels, pad_type=pad_type) - self.cell_11 = NormalCell( - in_chs_left=12 * channels, out_chs_left=2 * channels, - in_chs_right=12 * channels, out_chs_right=2 * channels, pad_type=pad_type) - - self.reduction_cell_1 = ReductionCell1( - in_chs_left=12 * channels, out_chs_left=4 * channels, - in_chs_right=12 * channels, out_chs_right=4 * channels, pad_type=pad_type) - self.cell_12 = FirstCell( - in_chs_left=12 * channels, out_chs_left=2 * channels, - in_chs_right=16 * channels, out_chs_right=4 * channels, pad_type=pad_type) - self.cell_13 = NormalCell( - in_chs_left=16 * channels, out_chs_left=4 * channels, - in_chs_right=24 * channels, out_chs_right=4 * channels, pad_type=pad_type) - self.cell_14 = NormalCell( - in_chs_left=24 * channels, out_chs_left=4 * channels, - in_chs_right=24 * channels, out_chs_right=4 * channels, pad_type=pad_type) - self.cell_15 = NormalCell( - in_chs_left=24 * channels, out_chs_left=4 * channels, - in_chs_right=24 * channels, out_chs_right=4 * channels, pad_type=pad_type) - self.cell_16 = NormalCell( - in_chs_left=24 * channels, out_chs_left=4 * channels, - in_chs_right=24 * channels, out_chs_right=4 * channels, pad_type=pad_type) - self.cell_17 = NormalCell( - in_chs_left=24 * channels, out_chs_left=4 * channels, - in_chs_right=24 * channels, out_chs_right=4 * channels, pad_type=pad_type) - self.act = nn.ReLU(inplace=True) - self.feature_info = [ - dict(num_chs=96, reduction=2, module='conv0'), - dict(num_chs=168, reduction=4, module='cell_stem_1.conv_1x1.act'), - dict(num_chs=1008, reduction=8, module='reduction_cell_0.conv_1x1.act'), - dict(num_chs=2016, reduction=16, module='reduction_cell_1.conv_1x1.act'), - dict(num_chs=4032, reduction=32, module='act'), - ] - - self.global_pool, self.last_linear = create_classifier( - self.num_features, self.num_classes, pool_type=global_pool) - - def get_classifier(self): - return self.last_linear - - def reset_classifier(self, num_classes, global_pool='avg'): - self.num_classes = num_classes - self.global_pool, self.last_linear = create_classifier( - self.num_features, self.num_classes, 
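# ---------------------------------------------------------------------------
# Hedged end-to-end shape check for NASNetALarge (assumes the vendored import
# path). With the default 331x331 input the spatial size halves five times
# (conv0, both stem cells, both reduction cells): 331 -> 165 -> 83 -> 42 ->
# 21 -> 11, and the final act emits num_features = 4032 channels.
import torch
from src.timm.models.nasnet import nasnetalarge

model = nasnetalarge(pretrained=False, num_classes=10)
x = torch.randn(1, 3, 331, 331)
assert model.forward_features(x).shape == (1, 4032, 11, 11)
assert model(x).shape == (1, 10)
# ---------------------------------------------------------------------------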
pool_type=global_pool) - - def forward_features(self, x): - x_conv0 = self.conv0(x) - - x_stem_0 = self.cell_stem_0(x_conv0) - x_stem_1 = self.cell_stem_1(x_conv0, x_stem_0) - - x_cell_0 = self.cell_0(x_stem_1, x_stem_0) - x_cell_1 = self.cell_1(x_cell_0, x_stem_1) - x_cell_2 = self.cell_2(x_cell_1, x_cell_0) - x_cell_3 = self.cell_3(x_cell_2, x_cell_1) - x_cell_4 = self.cell_4(x_cell_3, x_cell_2) - x_cell_5 = self.cell_5(x_cell_4, x_cell_3) - - x_reduction_cell_0 = self.reduction_cell_0(x_cell_5, x_cell_4) - x_cell_6 = self.cell_6(x_reduction_cell_0, x_cell_4) - x_cell_7 = self.cell_7(x_cell_6, x_reduction_cell_0) - x_cell_8 = self.cell_8(x_cell_7, x_cell_6) - x_cell_9 = self.cell_9(x_cell_8, x_cell_7) - x_cell_10 = self.cell_10(x_cell_9, x_cell_8) - x_cell_11 = self.cell_11(x_cell_10, x_cell_9) - - x_reduction_cell_1 = self.reduction_cell_1(x_cell_11, x_cell_10) - x_cell_12 = self.cell_12(x_reduction_cell_1, x_cell_10) - x_cell_13 = self.cell_13(x_cell_12, x_reduction_cell_1) - x_cell_14 = self.cell_14(x_cell_13, x_cell_12) - x_cell_15 = self.cell_15(x_cell_14, x_cell_13) - x_cell_16 = self.cell_16(x_cell_15, x_cell_14) - x_cell_17 = self.cell_17(x_cell_16, x_cell_15) - x = self.act(x_cell_17) - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.global_pool(x) - if self.drop_rate > 0: - x = F.dropout(x, self.drop_rate, training=self.training) - x = self.last_linear(x) - return x - - -def _create_nasnet(variant, pretrained=False, **kwargs): - return build_model_with_cfg( - NASNetALarge, variant, pretrained, - default_cfg=default_cfgs[variant], - feature_cfg=dict(feature_cls='hook', no_rewrite=True), # not possible to re-write this model - **kwargs) - - -@register_model -def nasnetalarge(pretrained=False, **kwargs): - """NASNet-A large model architecture. - """ - model_kwargs = dict(pad_type='same', **kwargs) - return _create_nasnet('nasnetalarge', pretrained, **model_kwargs) diff --git a/AVLFormer/src/timm/models/nfnet.py b/AVLFormer/src/timm/models/nfnet.py deleted file mode 100644 index c4d6d06..0000000 --- a/AVLFormer/src/timm/models/nfnet.py +++ /dev/null @@ -1,937 +0,0 @@ -""" Normalization Free Nets. NFNet, NF-RegNet, NF-ResNet (pre-activation) Models - -Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets` - - https://arxiv.org/abs/2101.08692 - -Paper: `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - -Official Deepmind JAX code: https://github.com/deepmind/deepmind-research/tree/master/nfnets - -Status: -* These models are a work in progress, experiments ongoing. -* Pretrained weights for two models so far, more to come. -* Model details updated to closer match official JAX code now that it's released -* NF-ResNet, NF-RegNet-B, and NFNet-F models supported - -Hacked together by / copyright Ross Wightman, 2021. 
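# ---------------------------------------------------------------------------
# Background for the NF models below: activations are scaled by a gain gamma
# chosen so that a unit-Gaussian input leaves the activation with roughly unit
# variance again. A rough empirical estimate for GELU, consistent with the
# ~1.7015 gain commonly quoted for NF-Nets (stated here as an assumption):
import torch
import torch.nn.functional as F

x = torch.randn(1_000_000)
print(float(1.0 / F.gelu(x).std()))   # ~1.70
# ---------------------------------------------------------------------------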
-""" -import math -from dataclasses import dataclass, field -from collections import OrderedDict -from typing import Tuple, Optional -from functools import partial - -import torch -import torch.nn as nn - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .helpers import build_model_with_cfg -from .registry import register_model -from .layers import ClassifierHead, DropPath, AvgPool2dSame, ScaledStdConv2d, ScaledStdConv2dSame,\ - get_act_layer, get_act_fn, get_attn, make_divisible - - -def _dcfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), - 'crop_pct': 0.9, 'interpolation': 'bicubic', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'stem.conv1', 'classifier': 'head.fc', - **kwargs - } - - -default_cfgs = dict( - dm_nfnet_f0=_dcfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f0-604f9c3a.pth', - pool_size=(6, 6), input_size=(3, 192, 192), test_input_size=(3, 256, 256), crop_pct=.9), - dm_nfnet_f1=_dcfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f1-fc540f82.pth', - pool_size=(7, 7), input_size=(3, 224, 224), test_input_size=(3, 320, 320), crop_pct=0.91), - dm_nfnet_f2=_dcfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f2-89875923.pth', - pool_size=(8, 8), input_size=(3, 256, 256), test_input_size=(3, 352, 352), crop_pct=0.92), - dm_nfnet_f3=_dcfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f3-d74ab3aa.pth', - pool_size=(10, 10), input_size=(3, 320, 320), test_input_size=(3, 416, 416), crop_pct=0.94), - dm_nfnet_f4=_dcfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f4-0ac5b10b.pth', - pool_size=(12, 12), input_size=(3, 384, 384), test_input_size=(3, 512, 512), crop_pct=0.951), - dm_nfnet_f5=_dcfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f5-ecb20ab1.pth', - pool_size=(13, 13), input_size=(3, 416, 416), test_input_size=(3, 544, 544), crop_pct=0.954), - dm_nfnet_f6=_dcfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f6-e0f12116.pth', - pool_size=(14, 14), input_size=(3, 448, 448), test_input_size=(3, 576, 576), crop_pct=0.956), - - nfnet_f0=_dcfg( - url='', pool_size=(6, 6), input_size=(3, 192, 192), test_input_size=(3, 256, 256)), - nfnet_f1=_dcfg( - url='', pool_size=(7, 7), input_size=(3, 224, 224), test_input_size=(3, 320, 320)), - nfnet_f2=_dcfg( - url='', pool_size=(8, 8), input_size=(3, 256, 256), test_input_size=(3, 352, 352)), - nfnet_f3=_dcfg( - url='', pool_size=(10, 10), input_size=(3, 320, 320), test_input_size=(3, 416, 416)), - nfnet_f4=_dcfg( - url='', pool_size=(12, 12), input_size=(3, 384, 384), test_input_size=(3, 512, 512)), - nfnet_f5=_dcfg( - url='', pool_size=(13, 13), input_size=(3, 416, 416), test_input_size=(3, 544, 544)), - nfnet_f6=_dcfg( - url='', pool_size=(14, 14), input_size=(3, 448, 448), test_input_size=(3, 576, 576)), - nfnet_f7=_dcfg( - url='', pool_size=(15, 15), input_size=(3, 480, 480), test_input_size=(3, 608, 608)), - - nfnet_f0s=_dcfg( - url='', pool_size=(6, 6), input_size=(3, 192, 192), test_input_size=(3, 256, 256)), - nfnet_f1s=_dcfg( - url='', pool_size=(7, 7), input_size=(3, 224, 224), test_input_size=(3, 320, 320)), - 
nfnet_f2s=_dcfg( - url='', pool_size=(8, 8), input_size=(3, 256, 256), test_input_size=(3, 352, 352)), - nfnet_f3s=_dcfg( - url='', pool_size=(10, 10), input_size=(3, 320, 320), test_input_size=(3, 416, 416)), - nfnet_f4s=_dcfg( - url='', pool_size=(12, 12), input_size=(3, 384, 384), test_input_size=(3, 512, 512)), - nfnet_f5s=_dcfg( - url='', pool_size=(13, 13), input_size=(3, 416, 416), test_input_size=(3, 544, 544)), - nfnet_f6s=_dcfg( - url='', pool_size=(14, 14), input_size=(3, 448, 448), test_input_size=(3, 576, 576)), - nfnet_f7s=_dcfg( - url='', pool_size=(15, 15), input_size=(3, 480, 480), test_input_size=(3, 608, 608)), - - nfnet_l0=_dcfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/nfnet_l0_ra2-45c6688d.pth', - pool_size=(7, 7), input_size=(3, 224, 224), test_input_size=(3, 288, 288), crop_pct=1.0), - eca_nfnet_l0=_dcfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ecanfnet_l0_ra2-e3e9ac50.pth', - hf_hub='timm/eca_nfnet_l0', - pool_size=(7, 7), input_size=(3, 224, 224), test_input_size=(3, 288, 288), crop_pct=1.0), - eca_nfnet_l1=_dcfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ecanfnet_l1_ra2-7dce93cd.pth', - pool_size=(8, 8), input_size=(3, 256, 256), test_input_size=(3, 320, 320), crop_pct=1.0), - - nf_regnet_b0=_dcfg( - url='', pool_size=(6, 6), input_size=(3, 192, 192), test_input_size=(3, 256, 256), first_conv='stem.conv'), - nf_regnet_b1=_dcfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/nf_regnet_b1_256_ra2-ad85cfef.pth', - pool_size=(8, 8), input_size=(3, 256, 256), test_input_size=(3, 288, 288), first_conv='stem.conv'), # NOT to paper spec - nf_regnet_b2=_dcfg( - url='', pool_size=(8, 8), input_size=(3, 240, 240), test_input_size=(3, 272, 272), first_conv='stem.conv'), - nf_regnet_b3=_dcfg( - url='', pool_size=(9, 9), input_size=(3, 288, 288), test_input_size=(3, 320, 320), first_conv='stem.conv'), - nf_regnet_b4=_dcfg( - url='', pool_size=(10, 10), input_size=(3, 320, 320), test_input_size=(3, 384, 384), first_conv='stem.conv'), - nf_regnet_b5=_dcfg( - url='', pool_size=(12, 12), input_size=(3, 384, 384), test_input_size=(3, 456, 456), first_conv='stem.conv'), - - nf_resnet26=_dcfg(url='', first_conv='stem.conv'), - nf_resnet50=_dcfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/nf_resnet50_ra2-9f236009.pth', - pool_size=(8, 8), input_size=(3, 256, 256), test_input_size=(3, 288, 288), crop_pct=0.94, first_conv='stem.conv'), - nf_resnet101=_dcfg(url='', first_conv='stem.conv'), - - nf_seresnet26=_dcfg(url='', first_conv='stem.conv'), - nf_seresnet50=_dcfg(url='', first_conv='stem.conv'), - nf_seresnet101=_dcfg(url='', first_conv='stem.conv'), - - nf_ecaresnet26=_dcfg(url='', first_conv='stem.conv'), - nf_ecaresnet50=_dcfg(url='', first_conv='stem.conv'), - nf_ecaresnet101=_dcfg(url='', first_conv='stem.conv'), -) - - -@dataclass -class NfCfg: - depths: Tuple[int, int, int, int] - channels: Tuple[int, int, int, int] - alpha: float = 0.2 - stem_type: str = '3x3' - stem_chs: Optional[int] = None - group_size: Optional[int] = None - attn_layer: Optional[str] = None - attn_kwargs: dict = None - attn_gain: float = 2.0 # NF correction gain to apply if attn layer is used - width_factor: float = 1.0 - bottle_ratio: float = 0.5 - num_features: int = 0 # num out_channels for final conv, no final_conv if 0 - ch_div: int = 8 # round channels % 8 == 0 to keep 
tensor-core use optimal - reg: bool = False # enables EfficientNet-like options used in RegNet variants, expand from in_chs, se in middle - extra_conv: bool = False # extra 3x3 bottleneck convolution for NFNet models - gamma_in_act: bool = False - same_padding: bool = False - skipinit: bool = False # disabled by default, non-trivial performance impact - zero_init_fc: bool = False - act_layer: str = 'silu' - - -def _nfres_cfg( - depths, channels=(256, 512, 1024, 2048), group_size=None, act_layer='relu', attn_layer=None, attn_kwargs=None): - attn_kwargs = attn_kwargs or {} - cfg = NfCfg( - depths=depths, channels=channels, stem_type='7x7_pool', stem_chs=64, bottle_ratio=0.25, - group_size=group_size, act_layer=act_layer, attn_layer=attn_layer, attn_kwargs=attn_kwargs) - return cfg - - -def _nfreg_cfg(depths, channels=(48, 104, 208, 440)): - num_features = 1280 * channels[-1] // 440 - attn_kwargs = dict(reduction_ratio=0.5, divisor=8) - cfg = NfCfg( - depths=depths, channels=channels, stem_type='3x3', group_size=8, width_factor=0.75, bottle_ratio=2.25, - num_features=num_features, reg=True, attn_layer='se', attn_kwargs=attn_kwargs) - return cfg - - -def _nfnet_cfg( - depths, channels=(256, 512, 1536, 1536), group_size=128, bottle_ratio=0.5, feat_mult=2., - act_layer='gelu', attn_layer='se', attn_kwargs=None): - num_features = int(channels[-1] * feat_mult) - attn_kwargs = attn_kwargs if attn_kwargs is not None else dict(reduction_ratio=0.5, divisor=8) - cfg = NfCfg( - depths=depths, channels=channels, stem_type='deep_quad', stem_chs=128, group_size=group_size, - bottle_ratio=bottle_ratio, extra_conv=True, num_features=num_features, act_layer=act_layer, - attn_layer=attn_layer, attn_kwargs=attn_kwargs) - return cfg - - -def _dm_nfnet_cfg(depths, channels=(256, 512, 1536, 1536), act_layer='gelu', skipinit=True): - attn_kwargs = dict(reduction_ratio=0.5, divisor=8) - cfg = NfCfg( - depths=depths, channels=channels, stem_type='deep_quad', stem_chs=128, group_size=128, - bottle_ratio=0.5, extra_conv=True, gamma_in_act=True, same_padding=True, skipinit=skipinit, - num_features=int(channels[-1] * 2.0), act_layer=act_layer, attn_layer='se', attn_kwargs=attn_kwargs) - return cfg - - -model_cfgs = dict( - # NFNet-F models w/ GELU compatible with DeepMind weights - dm_nfnet_f0=_dm_nfnet_cfg(depths=(1, 2, 6, 3)), - dm_nfnet_f1=_dm_nfnet_cfg(depths=(2, 4, 12, 6)), - dm_nfnet_f2=_dm_nfnet_cfg(depths=(3, 6, 18, 9)), - dm_nfnet_f3=_dm_nfnet_cfg(depths=(4, 8, 24, 12)), - dm_nfnet_f4=_dm_nfnet_cfg(depths=(5, 10, 30, 15)), - dm_nfnet_f5=_dm_nfnet_cfg(depths=(6, 12, 36, 18)), - dm_nfnet_f6=_dm_nfnet_cfg(depths=(7, 14, 42, 21)), - - # NFNet-F models w/ GELU (I will likely deprecate/remove these models and just keep dm_ ver for GELU) - nfnet_f0=_nfnet_cfg(depths=(1, 2, 6, 3)), - nfnet_f1=_nfnet_cfg(depths=(2, 4, 12, 6)), - nfnet_f2=_nfnet_cfg(depths=(3, 6, 18, 9)), - nfnet_f3=_nfnet_cfg(depths=(4, 8, 24, 12)), - nfnet_f4=_nfnet_cfg(depths=(5, 10, 30, 15)), - nfnet_f5=_nfnet_cfg(depths=(6, 12, 36, 18)), - nfnet_f6=_nfnet_cfg(depths=(7, 14, 42, 21)), - nfnet_f7=_nfnet_cfg(depths=(8, 16, 48, 24)), - - # NFNet-F models w/ SiLU (much faster in PyTorch) - nfnet_f0s=_nfnet_cfg(depths=(1, 2, 6, 3), act_layer='silu'), - nfnet_f1s=_nfnet_cfg(depths=(2, 4, 12, 6), act_layer='silu'), - nfnet_f2s=_nfnet_cfg(depths=(3, 6, 18, 9), act_layer='silu'), - nfnet_f3s=_nfnet_cfg(depths=(4, 8, 24, 12), act_layer='silu'), - nfnet_f4s=_nfnet_cfg(depths=(5, 10, 30, 15), act_layer='silu'), - nfnet_f5s=_nfnet_cfg(depths=(6, 12, 36, 18), 
act_layer='silu'), - nfnet_f6s=_nfnet_cfg(depths=(7, 14, 42, 21), act_layer='silu'), - nfnet_f7s=_nfnet_cfg(depths=(8, 16, 48, 24), act_layer='silu'), - - # Experimental 'light' versions of NFNet-F that are little leaner - nfnet_l0=_nfnet_cfg( - depths=(1, 2, 6, 3), feat_mult=1.5, group_size=64, bottle_ratio=0.25, - attn_kwargs=dict(reduction_ratio=0.25, divisor=8), act_layer='silu'), - eca_nfnet_l0=_nfnet_cfg( - depths=(1, 2, 6, 3), feat_mult=1.5, group_size=64, bottle_ratio=0.25, - attn_layer='eca', attn_kwargs=dict(), act_layer='silu'), - eca_nfnet_l1=_nfnet_cfg( - depths=(2, 4, 12, 6), feat_mult=2, group_size=64, bottle_ratio=0.25, - attn_layer='eca', attn_kwargs=dict(), act_layer='silu'), - - # EffNet influenced RegNet defs. - # NOTE: These aren't quite the official ver, ch_div=1 must be set for exact ch counts. I round to ch_div=8. - nf_regnet_b0=_nfreg_cfg(depths=(1, 3, 6, 6)), - nf_regnet_b1=_nfreg_cfg(depths=(2, 4, 7, 7)), - nf_regnet_b2=_nfreg_cfg(depths=(2, 4, 8, 8), channels=(56, 112, 232, 488)), - nf_regnet_b3=_nfreg_cfg(depths=(2, 5, 9, 9), channels=(56, 128, 248, 528)), - nf_regnet_b4=_nfreg_cfg(depths=(2, 6, 11, 11), channels=(64, 144, 288, 616)), - nf_regnet_b5=_nfreg_cfg(depths=(3, 7, 14, 14), channels=(80, 168, 336, 704)), - # FIXME add B6-B8 - - # ResNet (preact, D style deep stem/avg down) defs - nf_resnet26=_nfres_cfg(depths=(2, 2, 2, 2)), - nf_resnet50=_nfres_cfg(depths=(3, 4, 6, 3)), - nf_resnet101=_nfres_cfg(depths=(3, 4, 23, 3)), - - nf_seresnet26=_nfres_cfg(depths=(2, 2, 2, 2), attn_layer='se', attn_kwargs=dict(reduction_ratio=1/16)), - nf_seresnet50=_nfres_cfg(depths=(3, 4, 6, 3), attn_layer='se', attn_kwargs=dict(reduction_ratio=1/16)), - nf_seresnet101=_nfres_cfg(depths=(3, 4, 23, 3), attn_layer='se', attn_kwargs=dict(reduction_ratio=1/16)), - - nf_ecaresnet26=_nfres_cfg(depths=(2, 2, 2, 2), attn_layer='eca', attn_kwargs=dict()), - nf_ecaresnet50=_nfres_cfg(depths=(3, 4, 6, 3), attn_layer='eca', attn_kwargs=dict()), - nf_ecaresnet101=_nfres_cfg(depths=(3, 4, 23, 3), attn_layer='eca', attn_kwargs=dict()), - -) - - -class GammaAct(nn.Module): - def __init__(self, act_type='relu', gamma: float = 1.0, inplace=False): - super().__init__() - self.act_fn = get_act_fn(act_type) - self.gamma = gamma - self.inplace = inplace - - def forward(self, x): - return self.act_fn(x, inplace=self.inplace).mul_(self.gamma) - - -def act_with_gamma(act_type, gamma: float = 1.): - def _create(inplace=False): - return GammaAct(act_type, gamma=gamma, inplace=inplace) - return _create - - -class DownsampleAvg(nn.Module): - def __init__( - self, in_chs, out_chs, stride=1, dilation=1, first_dilation=None, conv_layer=ScaledStdConv2d): - """ AvgPool Downsampling as in 'D' ResNet variants. Support for dilation.""" - super(DownsampleAvg, self).__init__() - avg_stride = stride if dilation == 1 else 1 - if stride > 1 or dilation > 1: - avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d - self.pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False) - else: - self.pool = nn.Identity() - self.conv = conv_layer(in_chs, out_chs, 1, stride=1) - - def forward(self, x): - return self.conv(self.pool(x)) - - -class NormFreeBlock(nn.Module): - """Normalization-Free pre-activation block. 
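    In pseudo-form (summarizing forward() below; the optional conv2b, attention,
    drop-path and skip-init steps are omitted):

        pre = act1(x) * beta                  # rescale input back to ~unit variance
        shortcut = downsample(pre) if self.downsample is not None else x
        out = conv3(act3(conv2(act2(conv1(pre)))))
        return out * alpha + shortcut         # variance grows by alpha**2 per block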
- """ - - def __init__( - self, in_chs, out_chs=None, stride=1, dilation=1, first_dilation=None, - alpha=1.0, beta=1.0, bottle_ratio=0.25, group_size=None, ch_div=1, reg=True, extra_conv=False, - skipinit=False, attn_layer=None, attn_gain=2.0, act_layer=None, conv_layer=None, drop_path_rate=0.): - super().__init__() - first_dilation = first_dilation or dilation - out_chs = out_chs or in_chs - # RegNet variants scale bottleneck from in_chs, otherwise scale from out_chs like ResNet - mid_chs = make_divisible(in_chs * bottle_ratio if reg else out_chs * bottle_ratio, ch_div) - groups = 1 if not group_size else mid_chs // group_size - if group_size and group_size % ch_div == 0: - mid_chs = group_size * groups # correct mid_chs if group_size divisible by ch_div, otherwise error - self.alpha = alpha - self.beta = beta - self.attn_gain = attn_gain - - if in_chs != out_chs or stride != 1 or dilation != first_dilation: - self.downsample = DownsampleAvg( - in_chs, out_chs, stride=stride, dilation=dilation, first_dilation=first_dilation, conv_layer=conv_layer) - else: - self.downsample = None - - self.act1 = act_layer() - self.conv1 = conv_layer(in_chs, mid_chs, 1) - self.act2 = act_layer(inplace=True) - self.conv2 = conv_layer(mid_chs, mid_chs, 3, stride=stride, dilation=first_dilation, groups=groups) - if extra_conv: - self.act2b = act_layer(inplace=True) - self.conv2b = conv_layer(mid_chs, mid_chs, 3, stride=1, dilation=dilation, groups=groups) - else: - self.act2b = None - self.conv2b = None - if reg and attn_layer is not None: - self.attn = attn_layer(mid_chs) # RegNet blocks apply attn btw conv2 & 3 - else: - self.attn = None - self.act3 = act_layer() - self.conv3 = conv_layer(mid_chs, out_chs, 1, gain_init=1. if skipinit else 0.) - if not reg and attn_layer is not None: - self.attn_last = attn_layer(out_chs) # ResNet blocks apply attn after conv3 - else: - self.attn_last = None - self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() - self.skipinit_gain = nn.Parameter(torch.tensor(0.)) if skipinit else None - - def forward(self, x): - out = self.act1(x) * self.beta - - # shortcut branch - shortcut = x - if self.downsample is not None: - shortcut = self.downsample(out) - - # residual branch - out = self.conv1(out) - out = self.conv2(self.act2(out)) - if self.conv2b is not None: - out = self.conv2b(self.act2b(out)) - if self.attn is not None: - out = self.attn_gain * self.attn(out) - out = self.conv3(self.act3(out)) - if self.attn_last is not None: - out = self.attn_gain * self.attn_last(out) - out = self.drop_path(out) - - if self.skipinit_gain is not None: - out.mul_(self.skipinit_gain) # this slows things down more than expected, TBD - out = out * self.alpha + shortcut - return out - - -def create_stem(in_chs, out_chs, stem_type='', conv_layer=None, act_layer=None, preact_feature=True): - stem_stride = 2 - stem_feature = dict(num_chs=out_chs, reduction=2, module='stem.conv') - stem = OrderedDict() - assert stem_type in ('', 'deep', 'deep_tiered', 'deep_quad', '3x3', '7x7', 'deep_pool', '3x3_pool', '7x7_pool') - if 'deep' in stem_type: - if 'quad' in stem_type: - # 4 deep conv stack as in NFNet-F models - assert not 'pool' in stem_type - stem_chs = (out_chs // 8, out_chs // 4, out_chs // 2, out_chs) - strides = (2, 1, 1, 2) - stem_stride = 4 - stem_feature = dict(num_chs=out_chs // 2, reduction=2, module='stem.conv3') - else: - if 'tiered' in stem_type: - stem_chs = (3 * out_chs // 8, out_chs // 2, out_chs) # 'T' resnets in resnet.py - else: - stem_chs = (out_chs // 
2, out_chs // 2, out_chs) # 'D' ResNets - strides = (2, 1, 1) - stem_feature = dict(num_chs=out_chs // 2, reduction=2, module='stem.conv2') - last_idx = len(stem_chs) - 1 - for i, (c, s) in enumerate(zip(stem_chs, strides)): - stem[f'conv{i + 1}'] = conv_layer(in_chs, c, kernel_size=3, stride=s) - if i != last_idx: - stem[f'act{i + 2}'] = act_layer(inplace=True) - in_chs = c - elif '3x3' in stem_type: - # 3x3 stem conv as in RegNet - stem['conv'] = conv_layer(in_chs, out_chs, kernel_size=3, stride=2) - else: - # 7x7 stem conv as in ResNet - stem['conv'] = conv_layer(in_chs, out_chs, kernel_size=7, stride=2) - - if 'pool' in stem_type: - stem['pool'] = nn.MaxPool2d(3, stride=2, padding=1) - stem_stride = 4 - - return nn.Sequential(stem), stem_stride, stem_feature - - -# from https://github.com/deepmind/deepmind-research/tree/master/nfnets -_nonlin_gamma = dict( - identity=1.0, - celu=1.270926833152771, - elu=1.2716004848480225, - gelu=1.7015043497085571, - leaky_relu=1.70590341091156, - log_sigmoid=1.9193484783172607, - log_softmax=1.0002083778381348, - relu=1.7139588594436646, - relu6=1.7131484746932983, - selu=1.0008515119552612, - sigmoid=4.803835391998291, - silu=1.7881293296813965, - softsign=2.338853120803833, - softplus=1.9203323125839233, - tanh=1.5939117670059204, -) - - -class NormFreeNet(nn.Module): - """ Normalization-Free Network - - As described in : - `Characterizing signal propagation to close the performance gap in unnormalized ResNets` - - https://arxiv.org/abs/2101.08692 - and - `High-Performance Large-Scale Image Recognition Without Normalization` - https://arxiv.org/abs/2102.06171 - - This model aims to cover both the NFRegNet-Bx models as detailed in the paper's code snippets and - the (preact) ResNet models described earlier in the paper. - - There are a few differences: - * channels are rounded to be divisible by 8 by default (keep tensor core kernels happy), - this changes channel dim and param counts slightly from the paper models - * activation correcting gamma constants are moved into the ScaledStdConv as it has less performance - impact in PyTorch when done with the weight scaling there. This likely wasn't a concern in the JAX impl. - * a config option `gamma_in_act` can be enabled to not apply gamma in StdConv as described above, but - apply it in each activation. This is slightly slower, numerically different, but matches official impl. - * skipinit is disabled by default, it seems to have a rather drastic impact on GPU memory use and throughput - for what it is/does. Approx 8-10% throughput loss. - """ - def __init__(self, cfg: NfCfg, num_classes=1000, in_chans=3, global_pool='avg', output_stride=32, - drop_rate=0., drop_path_rate=0.): - super().__init__() - self.num_classes = num_classes - self.drop_rate = drop_rate - assert cfg.act_layer in _nonlin_gamma, f"Please add non-linearity constants for activation ({cfg.act_layer})." 
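        # Sanity check for the _nonlin_gamma table above (a standalone sketch, not
        # used by the model code): each constant makes its activation roughly
        # variance-preserving for unit-Gaussian input, i.e. gamma ~= 1 / std(act(z))
        # for z ~ N(0, 1):
        #
        #     import torch
        #     z = torch.randn(10_000_000)
        #     print(1.0 / torch.nn.functional.gelu(z).std())  # ~1.70, cf. _nonlin_gamma['gelu']
        #     print(1.0 / torch.nn.functional.relu(z).std())  # ~1.71, cf. _nonlin_gamma['relu']
        #
        # Agreement with the table is to a couple of decimal places, not exact.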
- conv_layer = ScaledStdConv2dSame if cfg.same_padding else ScaledStdConv2d - if cfg.gamma_in_act: - act_layer = act_with_gamma(cfg.act_layer, gamma=_nonlin_gamma[cfg.act_layer]) - conv_layer = partial(conv_layer, eps=1e-4) # DM weights better with higher eps - else: - act_layer = get_act_layer(cfg.act_layer) - conv_layer = partial(conv_layer, gamma=_nonlin_gamma[cfg.act_layer]) - attn_layer = partial(get_attn(cfg.attn_layer), **cfg.attn_kwargs) if cfg.attn_layer else None - - stem_chs = make_divisible((cfg.stem_chs or cfg.channels[0]) * cfg.width_factor, cfg.ch_div) - self.stem, stem_stride, stem_feat = create_stem( - in_chans, stem_chs, cfg.stem_type, conv_layer=conv_layer, act_layer=act_layer) - - self.feature_info = [stem_feat] - drop_path_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(cfg.depths)).split(cfg.depths)] - prev_chs = stem_chs - net_stride = stem_stride - dilation = 1 - expected_var = 1.0 - stages = [] - for stage_idx, stage_depth in enumerate(cfg.depths): - stride = 1 if stage_idx == 0 and stem_stride > 2 else 2 - if net_stride >= output_stride and stride > 1: - dilation *= stride - stride = 1 - net_stride *= stride - first_dilation = 1 if dilation in (1, 2) else 2 - - blocks = [] - for block_idx in range(cfg.depths[stage_idx]): - first_block = block_idx == 0 and stage_idx == 0 - out_chs = make_divisible(cfg.channels[stage_idx] * cfg.width_factor, cfg.ch_div) - blocks += [NormFreeBlock( - in_chs=prev_chs, out_chs=out_chs, - alpha=cfg.alpha, - beta=1. / expected_var ** 0.5, - stride=stride if block_idx == 0 else 1, - dilation=dilation, - first_dilation=first_dilation, - group_size=cfg.group_size, - bottle_ratio=1. if cfg.reg and first_block else cfg.bottle_ratio, - ch_div=cfg.ch_div, - reg=cfg.reg, - extra_conv=cfg.extra_conv, - skipinit=cfg.skipinit, - attn_layer=attn_layer, - attn_gain=cfg.attn_gain, - act_layer=act_layer, - conv_layer=conv_layer, - drop_path_rate=drop_path_rates[stage_idx][block_idx], - )] - if block_idx == 0: - expected_var = 1. # expected var is reset after first block of each stage - expected_var += cfg.alpha ** 2 # Even if reset occurs, increment expected variance - first_dilation = dilation - prev_chs = out_chs - self.feature_info += [dict(num_chs=prev_chs, reduction=net_stride, module=f'stages.{stage_idx}')] - stages += [nn.Sequential(*blocks)] - self.stages = nn.Sequential(*stages) - - if cfg.num_features: - # The paper NFRegNet models have an EfficientNet-like final head convolution. 
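            # (i.e. a 1x1 conv widens the last stage before global pooling, mirroring
            # the EfficientNet head; its width is cfg.num_features scaled by
            # width_factor and rounded via ch_div below)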
- self.num_features = make_divisible(cfg.width_factor * cfg.num_features, cfg.ch_div) - self.final_conv = conv_layer(prev_chs, self.num_features, 1) - self.feature_info[-1] = dict(num_chs=self.num_features, reduction=net_stride, module=f'final_conv') - else: - self.num_features = prev_chs - self.final_conv = nn.Identity() - self.final_act = act_layer(inplace=cfg.num_features > 0) - - self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) - - for n, m in self.named_modules(): - if 'fc' in n and isinstance(m, nn.Linear): - if cfg.zero_init_fc: - nn.init.zeros_(m.weight) - else: - nn.init.normal_(m.weight, 0., .01) - if m.bias is not None: - nn.init.zeros_(m.bias) - elif isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='linear') - if m.bias is not None: - nn.init.zeros_(m.bias) - - def get_classifier(self): - return self.head.fc - - def reset_classifier(self, num_classes, global_pool='avg'): - self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) - - def forward_features(self, x): - x = self.stem(x) - x = self.stages(x) - x = self.final_conv(x) - x = self.final_act(x) - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.head(x) - return x - - -def _create_normfreenet(variant, pretrained=False, **kwargs): - model_cfg = model_cfgs[variant] - feature_cfg = dict(flatten_sequential=True) - return build_model_with_cfg( - NormFreeNet, variant, pretrained, - default_cfg=default_cfgs[variant], - model_cfg=model_cfg, - feature_cfg=feature_cfg, - **kwargs) - - -@register_model -def dm_nfnet_f0(pretrained=False, **kwargs): - """ NFNet-F0 (DeepMind weight compatible) - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('dm_nfnet_f0', pretrained=pretrained, **kwargs) - - -@register_model -def dm_nfnet_f1(pretrained=False, **kwargs): - """ NFNet-F1 (DeepMind weight compatible) - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('dm_nfnet_f1', pretrained=pretrained, **kwargs) - - -@register_model -def dm_nfnet_f2(pretrained=False, **kwargs): - """ NFNet-F2 (DeepMind weight compatible) - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('dm_nfnet_f2', pretrained=pretrained, **kwargs) - - -@register_model -def dm_nfnet_f3(pretrained=False, **kwargs): - """ NFNet-F3 (DeepMind weight compatible) - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('dm_nfnet_f3', pretrained=pretrained, **kwargs) - - -@register_model -def dm_nfnet_f4(pretrained=False, **kwargs): - """ NFNet-F4 (DeepMind weight compatible) - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('dm_nfnet_f4', pretrained=pretrained, **kwargs) - - -@register_model -def dm_nfnet_f5(pretrained=False, **kwargs): - """ NFNet-F5 (DeepMind weight compatible) - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('dm_nfnet_f5', pretrained=pretrained, **kwargs) - - -@register_model -def dm_nfnet_f6(pretrained=False, **kwargs): - """ NFNet-F6 
(DeepMind weight compatible) - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('dm_nfnet_f6', pretrained=pretrained, **kwargs) - - -@register_model -def nfnet_f0(pretrained=False, **kwargs): - """ NFNet-F0 - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('nfnet_f0', pretrained=pretrained, **kwargs) - - -@register_model -def nfnet_f1(pretrained=False, **kwargs): - """ NFNet-F1 - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('nfnet_f1', pretrained=pretrained, **kwargs) - - -@register_model -def nfnet_f2(pretrained=False, **kwargs): - """ NFNet-F2 - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('nfnet_f2', pretrained=pretrained, **kwargs) - - -@register_model -def nfnet_f3(pretrained=False, **kwargs): - """ NFNet-F3 - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('nfnet_f3', pretrained=pretrained, **kwargs) - - -@register_model -def nfnet_f4(pretrained=False, **kwargs): - """ NFNet-F4 - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('nfnet_f4', pretrained=pretrained, **kwargs) - - -@register_model -def nfnet_f5(pretrained=False, **kwargs): - """ NFNet-F5 - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('nfnet_f5', pretrained=pretrained, **kwargs) - - -@register_model -def nfnet_f6(pretrained=False, **kwargs): - """ NFNet-F6 - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('nfnet_f6', pretrained=pretrained, **kwargs) - - -@register_model -def nfnet_f7(pretrained=False, **kwargs): - """ NFNet-F7 - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('nfnet_f7', pretrained=pretrained, **kwargs) - - -@register_model -def nfnet_f0s(pretrained=False, **kwargs): - """ NFNet-F0 w/ SiLU - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('nfnet_f0s', pretrained=pretrained, **kwargs) - - -@register_model -def nfnet_f1s(pretrained=False, **kwargs): - """ NFNet-F1 w/ SiLU - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('nfnet_f1s', pretrained=pretrained, **kwargs) - - -@register_model -def nfnet_f2s(pretrained=False, **kwargs): - """ NFNet-F2 w/ SiLU - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('nfnet_f2s', pretrained=pretrained, **kwargs) - - -@register_model -def nfnet_f3s(pretrained=False, **kwargs): - """ NFNet-F3 w/ SiLU - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('nfnet_f3s', pretrained=pretrained, **kwargs) - - -@register_model -def nfnet_f4s(pretrained=False, **kwargs): 
- """ NFNet-F4 w/ SiLU - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('nfnet_f4s', pretrained=pretrained, **kwargs) - - -@register_model -def nfnet_f5s(pretrained=False, **kwargs): - """ NFNet-F5 w/ SiLU - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('nfnet_f5s', pretrained=pretrained, **kwargs) - - -@register_model -def nfnet_f6s(pretrained=False, **kwargs): - """ NFNet-F6 w/ SiLU - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('nfnet_f6s', pretrained=pretrained, **kwargs) - - -@register_model -def nfnet_f7s(pretrained=False, **kwargs): - """ NFNet-F7 w/ SiLU - `High-Performance Large-Scale Image Recognition Without Normalization` - - https://arxiv.org/abs/2102.06171 - """ - return _create_normfreenet('nfnet_f7s', pretrained=pretrained, **kwargs) - - -@register_model -def nfnet_l0(pretrained=False, **kwargs): - """ NFNet-L0b w/ SiLU - My experimental 'light' model w/ F0 repeats, 1.5x final_conv mult, 64 group_size, .25 bottleneck & SE ratio - """ - return _create_normfreenet('nfnet_l0', pretrained=pretrained, **kwargs) - - -@register_model -def eca_nfnet_l0(pretrained=False, **kwargs): - """ ECA-NFNet-L0 w/ SiLU - My experimental 'light' model w/ F0 repeats, 1.5x final_conv mult, 64 group_size, .25 bottleneck & ECA attn - """ - return _create_normfreenet('eca_nfnet_l0', pretrained=pretrained, **kwargs) - - -@register_model -def eca_nfnet_l1(pretrained=False, **kwargs): - """ ECA-NFNet-L1 w/ SiLU - My experimental 'light' model w/ F1 repeats, 2.0x final_conv mult, 64 group_size, .25 bottleneck & ECA attn - """ - return _create_normfreenet('eca_nfnet_l1', pretrained=pretrained, **kwargs) - - -@register_model -def nf_regnet_b0(pretrained=False, **kwargs): - """ Normalization-Free RegNet-B0 - `Characterizing signal propagation to close the performance gap in unnormalized ResNets` - - https://arxiv.org/abs/2101.08692 - """ - return _create_normfreenet('nf_regnet_b0', pretrained=pretrained, **kwargs) - - -@register_model -def nf_regnet_b1(pretrained=False, **kwargs): - """ Normalization-Free RegNet-B1 - `Characterizing signal propagation to close the performance gap in unnormalized ResNets` - - https://arxiv.org/abs/2101.08692 - """ - return _create_normfreenet('nf_regnet_b1', pretrained=pretrained, **kwargs) - - -@register_model -def nf_regnet_b2(pretrained=False, **kwargs): - """ Normalization-Free RegNet-B2 - `Characterizing signal propagation to close the performance gap in unnormalized ResNets` - - https://arxiv.org/abs/2101.08692 - """ - return _create_normfreenet('nf_regnet_b2', pretrained=pretrained, **kwargs) - - -@register_model -def nf_regnet_b3(pretrained=False, **kwargs): - """ Normalization-Free RegNet-B3 - `Characterizing signal propagation to close the performance gap in unnormalized ResNets` - - https://arxiv.org/abs/2101.08692 - """ - return _create_normfreenet('nf_regnet_b3', pretrained=pretrained, **kwargs) - - -@register_model -def nf_regnet_b4(pretrained=False, **kwargs): - """ Normalization-Free RegNet-B4 - `Characterizing signal propagation to close the performance gap in unnormalized ResNets` - - https://arxiv.org/abs/2101.08692 - """ - return _create_normfreenet('nf_regnet_b4', pretrained=pretrained, **kwargs) - - -@register_model -def nf_regnet_b5(pretrained=False, **kwargs): - 
""" Normalization-Free RegNet-B5 - `Characterizing signal propagation to close the performance gap in unnormalized ResNets` - - https://arxiv.org/abs/2101.08692 - """ - return _create_normfreenet('nf_regnet_b5', pretrained=pretrained, **kwargs) - - -@register_model -def nf_resnet26(pretrained=False, **kwargs): - """ Normalization-Free ResNet-26 - `Characterizing signal propagation to close the performance gap in unnormalized ResNets` - - https://arxiv.org/abs/2101.08692 - """ - return _create_normfreenet('nf_resnet26', pretrained=pretrained, **kwargs) - - -@register_model -def nf_resnet50(pretrained=False, **kwargs): - """ Normalization-Free ResNet-50 - `Characterizing signal propagation to close the performance gap in unnormalized ResNets` - - https://arxiv.org/abs/2101.08692 - """ - return _create_normfreenet('nf_resnet50', pretrained=pretrained, **kwargs) - - -@register_model -def nf_resnet101(pretrained=False, **kwargs): - """ Normalization-Free ResNet-101 - `Characterizing signal propagation to close the performance gap in unnormalized ResNets` - - https://arxiv.org/abs/2101.08692 - """ - return _create_normfreenet('nf_resnet101', pretrained=pretrained, **kwargs) - - -@register_model -def nf_seresnet26(pretrained=False, **kwargs): - """ Normalization-Free SE-ResNet26 - """ - return _create_normfreenet('nf_seresnet26', pretrained=pretrained, **kwargs) - - -@register_model -def nf_seresnet50(pretrained=False, **kwargs): - """ Normalization-Free SE-ResNet50 - """ - return _create_normfreenet('nf_seresnet50', pretrained=pretrained, **kwargs) - - -@register_model -def nf_seresnet101(pretrained=False, **kwargs): - """ Normalization-Free SE-ResNet101 - """ - return _create_normfreenet('nf_seresnet101', pretrained=pretrained, **kwargs) - - -@register_model -def nf_ecaresnet26(pretrained=False, **kwargs): - """ Normalization-Free ECA-ResNet26 - """ - return _create_normfreenet('nf_ecaresnet26', pretrained=pretrained, **kwargs) - - -@register_model -def nf_ecaresnet50(pretrained=False, **kwargs): - """ Normalization-Free ECA-ResNet50 - """ - return _create_normfreenet('nf_ecaresnet50', pretrained=pretrained, **kwargs) - - -@register_model -def nf_ecaresnet101(pretrained=False, **kwargs): - """ Normalization-Free ECA-ResNet101 - """ - return _create_normfreenet('nf_ecaresnet101', pretrained=pretrained, **kwargs) diff --git a/AVLFormer/src/timm/models/pit.py b/AVLFormer/src/timm/models/pit.py deleted file mode 100644 index 8b3c49e..0000000 --- a/AVLFormer/src/timm/models/pit.py +++ /dev/null @@ -1,388 +0,0 @@ -""" Pooling-based Vision Transformer (PiT) in PyTorch - -A PyTorch implement of Pooling-based Vision Transformers as described in -'Rethinking Spatial Dimensions of Vision Transformers' - https://arxiv.org/abs/2103.16302 - -This code was adapted from the original version at https://github.com/naver-ai/pit, original copyright below. - -Modifications for timm by / Copyright 2020 Ross Wightman -""" -# PiT -# Copyright 2021-present NAVER Corp. 
-# Apache License v2.0 - -import math -import re -from copy import deepcopy -from functools import partial -from typing import Tuple - -import torch -from torch import nn - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .helpers import build_model_with_cfg, overlay_external_default_cfg -from .layers import trunc_normal_, to_2tuple -from .registry import register_model -from .vision_transformer import Block - - -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, - 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True, - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'patch_embed.conv', 'classifier': 'head', - **kwargs - } - - -default_cfgs = { - # deit models (FB weights) - 'pit_ti_224': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_ti_730.pth'), - 'pit_xs_224': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_xs_781.pth'), - 'pit_s_224': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_s_809.pth'), - 'pit_b_224': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_b_820.pth'), - 'pit_ti_distilled_224': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_ti_distill_746.pth', - classifier=('head', 'head_dist')), - 'pit_xs_distilled_224': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_xs_distill_791.pth', - classifier=('head', 'head_dist')), - 'pit_s_distilled_224': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_s_distill_819.pth', - classifier=('head', 'head_dist')), - 'pit_b_distilled_224': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_b_distill_840.pth', - classifier=('head', 'head_dist')), -} - - -class SequentialTuple(nn.Sequential): - """ This module exists to work around torchscript typing issues list -> list""" - def __init__(self, *args): - super(SequentialTuple, self).__init__(*args) - - def forward(self, x: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: - for module in self: - x = module(x) - return x - - -class Transformer(nn.Module): - def __init__( - self, base_dim, depth, heads, mlp_ratio, pool=None, drop_rate=.0, attn_drop_rate=.0, drop_path_prob=None): - super(Transformer, self).__init__() - self.layers = nn.ModuleList([]) - embed_dim = base_dim * heads - - self.blocks = nn.Sequential(*[ - Block( - dim=embed_dim, - num_heads=heads, - mlp_ratio=mlp_ratio, - qkv_bias=True, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=drop_path_prob[i], - norm_layer=partial(nn.LayerNorm, eps=1e-6) - ) - for i in range(depth)]) - - self.pool = pool - - def forward(self, x: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: - x, cls_tokens = x - B, C, H, W = x.shape - token_length = cls_tokens.shape[1] - - x = x.flatten(2).transpose(1, 2) - x = torch.cat((cls_tokens, x), dim=1) - - x = self.blocks(x) - - cls_tokens = x[:, :token_length] - x = x[:, token_length:] - x = x.transpose(1, 2).reshape(B, C, H, W) - - if self.pool is not None: - x, cls_tokens = self.pool(x, cls_tokens) - return x, cls_tokens - - -class ConvHeadPooling(nn.Module): - def __init__(self, in_feature, 
out_feature, stride, padding_mode='zeros'): - super(ConvHeadPooling, self).__init__() - - self.conv = nn.Conv2d( - in_feature, out_feature, kernel_size=stride + 1, padding=stride // 2, stride=stride, - padding_mode=padding_mode, groups=in_feature) - self.fc = nn.Linear(in_feature, out_feature) - - def forward(self, x, cls_token) -> Tuple[torch.Tensor, torch.Tensor]: - - x = self.conv(x) - cls_token = self.fc(cls_token) - - return x, cls_token - - -class ConvEmbedding(nn.Module): - def __init__(self, in_channels, out_channels, patch_size, stride, padding): - super(ConvEmbedding, self).__init__() - self.conv = nn.Conv2d( - in_channels, out_channels, kernel_size=patch_size, stride=stride, padding=padding, bias=True) - - def forward(self, x): - x = self.conv(x) - return x - - -class PoolingVisionTransformer(nn.Module): - """ Pooling-based Vision Transformer - - A PyTorch implement of 'Rethinking Spatial Dimensions of Vision Transformers' - - https://arxiv.org/abs/2103.16302 - """ - def __init__(self, img_size, patch_size, stride, base_dims, depth, heads, - mlp_ratio, num_classes=1000, in_chans=3, distilled=False, - attn_drop_rate=.0, drop_rate=.0, drop_path_rate=.0): - super(PoolingVisionTransformer, self).__init__() - - padding = 0 - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - height = math.floor((img_size[0] + 2 * padding - patch_size[0]) / stride + 1) - width = math.floor((img_size[1] + 2 * padding - patch_size[1]) / stride + 1) - - self.base_dims = base_dims - self.heads = heads - self.num_classes = num_classes - self.num_tokens = 2 if distilled else 1 - - self.patch_size = patch_size - self.pos_embed = nn.Parameter(torch.randn(1, base_dims[0] * heads[0], height, width)) - self.patch_embed = ConvEmbedding(in_chans, base_dims[0] * heads[0], patch_size, stride, padding) - - self.cls_token = nn.Parameter(torch.randn(1, self.num_tokens, base_dims[0] * heads[0])) - self.pos_drop = nn.Dropout(p=drop_rate) - - transformers = [] - # stochastic depth decay rule - dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depth)).split(depth)] - for stage in range(len(depth)): - pool = None - if stage < len(heads) - 1: - pool = ConvHeadPooling( - base_dims[stage] * heads[stage], base_dims[stage + 1] * heads[stage + 1], stride=2) - transformers += [Transformer( - base_dims[stage], depth[stage], heads[stage], mlp_ratio, pool=pool, - drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_prob=dpr[stage]) - ] - self.transformers = SequentialTuple(*transformers) - self.norm = nn.LayerNorm(base_dims[-1] * heads[-1], eps=1e-6) - self.embed_dim = base_dims[-1] * heads[-1] - - # Classifier head - self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() - self.head_dist = nn.Linear(self.embed_dim, self.num_classes) \ - if num_classes > 0 and distilled else nn.Identity() - - trunc_normal_(self.pos_embed, std=.02) - trunc_normal_(self.cls_token, std=.02) - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - @torch.jit.ignore - def no_weight_decay(self): - return {'pos_embed', 'cls_token'} - - def get_classifier(self): - return self.head - - def reset_classifier(self, num_classes, global_pool=''): - self.num_classes = num_classes - self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() - self.head_dist = nn.Linear(self.embed_dim, self.num_classes) \ - if num_classes > 0 and self.num_tokens == 2 
else nn.Identity() - - def forward_features(self, x): - x = self.patch_embed(x) - x = self.pos_drop(x + self.pos_embed) - cls_tokens = self.cls_token.expand(x.shape[0], -1, -1) - x, cls_tokens = self.transformers((x, cls_tokens)) - cls_tokens = self.norm(cls_tokens) - return cls_tokens - - def forward(self, x): - x = self.forward_features(x) - x_cls = self.head(x[:, 0]) - if self.num_tokens > 1: - x_dist = self.head_dist(x[:, 1]) - if self.training and not torch.jit.is_scripting(): - return x_cls, x_dist - else: - return (x_cls + x_dist) / 2 - else: - return x_cls - - -def checkpoint_filter_fn(state_dict, model): - """ preprocess checkpoints """ - out_dict = {} - p_blocks = re.compile(r'pools\.(\d)\.') - for k, v in state_dict.items(): - # FIXME need to update resize for PiT impl - # if k == 'pos_embed' and v.shape != model.pos_embed.shape: - # # To resize pos embedding when using model at different size from pretrained weights - # v = resize_pos_embed(v, model.pos_embed) - k = p_blocks.sub(lambda exp: f'transformers.{int(exp.group(1))}.pool.', k) - out_dict[k] = v - return out_dict - - -def _create_pit(variant, pretrained=False, **kwargs): - default_cfg = deepcopy(default_cfgs[variant]) - overlay_external_default_cfg(default_cfg, kwargs) - default_num_classes = default_cfg['num_classes'] - default_img_size = default_cfg['input_size'][-2:] - img_size = kwargs.pop('img_size', default_img_size) - num_classes = kwargs.pop('num_classes', default_num_classes) - - if kwargs.get('features_only', None): - raise RuntimeError('features_only not implemented for Vision Transformer models.') - - model = build_model_with_cfg( - PoolingVisionTransformer, variant, pretrained, - default_cfg=default_cfg, - img_size=img_size, - num_classes=num_classes, - pretrained_filter_fn=checkpoint_filter_fn, - **kwargs) - - return model - - -@register_model -def pit_b_224(pretrained, **kwargs): - model_kwargs = dict( - patch_size=14, - stride=7, - base_dims=[64, 64, 64], - depth=[3, 6, 4], - heads=[4, 8, 16], - mlp_ratio=4, - **kwargs - ) - return _create_pit('pit_b_224', pretrained, **model_kwargs) - - -@register_model -def pit_s_224(pretrained, **kwargs): - model_kwargs = dict( - patch_size=16, - stride=8, - base_dims=[48, 48, 48], - depth=[2, 6, 4], - heads=[3, 6, 12], - mlp_ratio=4, - **kwargs - ) - return _create_pit('pit_s_224', pretrained, **model_kwargs) - - -@register_model -def pit_xs_224(pretrained, **kwargs): - model_kwargs = dict( - patch_size=16, - stride=8, - base_dims=[48, 48, 48], - depth=[2, 6, 4], - heads=[2, 4, 8], - mlp_ratio=4, - **kwargs - ) - return _create_pit('pit_xs_224', pretrained, **model_kwargs) - - -@register_model -def pit_ti_224(pretrained, **kwargs): - model_kwargs = dict( - patch_size=16, - stride=8, - base_dims=[32, 32, 32], - depth=[2, 6, 4], - heads=[2, 4, 8], - mlp_ratio=4, - **kwargs - ) - return _create_pit('pit_ti_224', pretrained, **model_kwargs) - - -@register_model -def pit_b_distilled_224(pretrained, **kwargs): - model_kwargs = dict( - patch_size=14, - stride=7, - base_dims=[64, 64, 64], - depth=[3, 6, 4], - heads=[4, 8, 16], - mlp_ratio=4, - distilled=True, - **kwargs - ) - return _create_pit('pit_b_distilled_224', pretrained, **model_kwargs) - - -@register_model -def pit_s_distilled_224(pretrained, **kwargs): - model_kwargs = dict( - patch_size=16, - stride=8, - base_dims=[48, 48, 48], - depth=[2, 6, 4], - heads=[3, 6, 12], - mlp_ratio=4, - distilled=True, - **kwargs - ) - return _create_pit('pit_s_distilled_224', pretrained, **model_kwargs) - - -@register_model -def 
pit_xs_distilled_224(pretrained, **kwargs): - model_kwargs = dict( - patch_size=16, - stride=8, - base_dims=[48, 48, 48], - depth=[2, 6, 4], - heads=[2, 4, 8], - mlp_ratio=4, - distilled=True, - **kwargs - ) - return _create_pit('pit_xs_distilled_224', pretrained, **model_kwargs) - - -@register_model -def pit_ti_distilled_224(pretrained, **kwargs): - model_kwargs = dict( - patch_size=16, - stride=8, - base_dims=[32, 32, 32], - depth=[2, 6, 4], - heads=[2, 4, 8], - mlp_ratio=4, - distilled=True, - **kwargs - ) - return _create_pit('pit_ti_distilled_224', pretrained, **model_kwargs) \ No newline at end of file diff --git a/AVLFormer/src/timm/models/pnasnet.py b/AVLFormer/src/timm/models/pnasnet.py deleted file mode 100644 index 9991815..0000000 --- a/AVLFormer/src/timm/models/pnasnet.py +++ /dev/null @@ -1,350 +0,0 @@ -""" - pnasnet5large implementation grabbed from Cadene's pretrained models - Additional credit to https://github.com/creafz - - https://github.com/Cadene/pretrained-models.pytorch/blob/master/pretrainedmodels/models/pnasnet.py - -""" -from collections import OrderedDict -from functools import partial - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from .helpers import build_model_with_cfg -from .layers import ConvBnAct, create_conv2d, create_pool2d, create_classifier -from .registry import register_model - -__all__ = ['PNASNet5Large'] - -default_cfgs = { - 'pnasnet5large': { - 'url': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-cadene/pnasnet5large-bf079911.pth', - 'input_size': (3, 331, 331), - 'pool_size': (11, 11), - 'crop_pct': 0.911, - 'interpolation': 'bicubic', - 'mean': (0.5, 0.5, 0.5), - 'std': (0.5, 0.5, 0.5), - 'num_classes': 1000, - 'first_conv': 'conv_0.conv', - 'classifier': 'last_linear', - 'label_offset': 1, # 1001 classes in pretrained weights - }, -} - - -class SeparableConv2d(nn.Module): - - def __init__(self, in_channels, out_channels, kernel_size, stride, padding=''): - super(SeparableConv2d, self).__init__() - self.depthwise_conv2d = create_conv2d( - in_channels, in_channels, kernel_size=kernel_size, - stride=stride, padding=padding, groups=in_channels) - self.pointwise_conv2d = create_conv2d( - in_channels, out_channels, kernel_size=1, padding=padding) - - def forward(self, x): - x = self.depthwise_conv2d(x) - x = self.pointwise_conv2d(x) - return x - - -class BranchSeparables(nn.Module): - - def __init__(self, in_channels, out_channels, kernel_size, stride=1, stem_cell=False, padding=''): - super(BranchSeparables, self).__init__() - middle_channels = out_channels if stem_cell else in_channels - self.act_1 = nn.ReLU() - self.separable_1 = SeparableConv2d( - in_channels, middle_channels, kernel_size, stride=stride, padding=padding) - self.bn_sep_1 = nn.BatchNorm2d(middle_channels, eps=0.001) - self.act_2 = nn.ReLU() - self.separable_2 = SeparableConv2d( - middle_channels, out_channels, kernel_size, stride=1, padding=padding) - self.bn_sep_2 = nn.BatchNorm2d(out_channels, eps=0.001) - - def forward(self, x): - x = self.act_1(x) - x = self.separable_1(x) - x = self.bn_sep_1(x) - x = self.act_2(x) - x = self.separable_2(x) - x = self.bn_sep_2(x) - return x - - -class ActConvBn(nn.Module): - - def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=''): - super(ActConvBn, self).__init__() - self.act = nn.ReLU() - self.conv = create_conv2d( - in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding) - self.bn = nn.BatchNorm2d(out_channels, eps=0.001) - - def 
forward(self, x): - x = self.act(x) - x = self.conv(x) - x = self.bn(x) - return x - - -class FactorizedReduction(nn.Module): - - def __init__(self, in_channels, out_channels, padding=''): - super(FactorizedReduction, self).__init__() - self.act = nn.ReLU() - self.path_1 = nn.Sequential(OrderedDict([ - ('avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False)), - ('conv', create_conv2d(in_channels, out_channels // 2, kernel_size=1, padding=padding)), - ])) - self.path_2 = nn.Sequential(OrderedDict([ - ('pad', nn.ZeroPad2d((-1, 1, -1, 1))), # shift - ('avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False)), - ('conv', create_conv2d(in_channels, out_channels // 2, kernel_size=1, padding=padding)), - ])) - self.final_path_bn = nn.BatchNorm2d(out_channels, eps=0.001) - - def forward(self, x): - x = self.act(x) - x_path1 = self.path_1(x) - x_path2 = self.path_2(x) - out = self.final_path_bn(torch.cat([x_path1, x_path2], 1)) - return out - - -class CellBase(nn.Module): - - def cell_forward(self, x_left, x_right): - x_comb_iter_0_left = self.comb_iter_0_left(x_left) - x_comb_iter_0_right = self.comb_iter_0_right(x_left) - x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right - - x_comb_iter_1_left = self.comb_iter_1_left(x_right) - x_comb_iter_1_right = self.comb_iter_1_right(x_right) - x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right - - x_comb_iter_2_left = self.comb_iter_2_left(x_right) - x_comb_iter_2_right = self.comb_iter_2_right(x_right) - x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right - - x_comb_iter_3_left = self.comb_iter_3_left(x_comb_iter_2) - x_comb_iter_3_right = self.comb_iter_3_right(x_right) - x_comb_iter_3 = x_comb_iter_3_left + x_comb_iter_3_right - - x_comb_iter_4_left = self.comb_iter_4_left(x_left) - if self.comb_iter_4_right is not None: - x_comb_iter_4_right = self.comb_iter_4_right(x_right) - else: - x_comb_iter_4_right = x_right - x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right - - x_out = torch.cat([x_comb_iter_0, x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1) - return x_out - - -class CellStem0(CellBase): - - def __init__(self, in_chs_left, out_chs_left, in_chs_right, out_chs_right, pad_type=''): - super(CellStem0, self).__init__() - self.conv_1x1 = ActConvBn(in_chs_right, out_chs_right, kernel_size=1, padding=pad_type) - - self.comb_iter_0_left = BranchSeparables( - in_chs_left, out_chs_left, kernel_size=5, stride=2, stem_cell=True, padding=pad_type) - self.comb_iter_0_right = nn.Sequential(OrderedDict([ - ('max_pool', create_pool2d('max', 3, stride=2, padding=pad_type)), - ('conv', create_conv2d(in_chs_left, out_chs_left, kernel_size=1, padding=pad_type)), - ('bn', nn.BatchNorm2d(out_chs_left, eps=0.001)), - ])) - - self.comb_iter_1_left = BranchSeparables( - out_chs_right, out_chs_right, kernel_size=7, stride=2, padding=pad_type) - self.comb_iter_1_right = create_pool2d('max', 3, stride=2, padding=pad_type) - - self.comb_iter_2_left = BranchSeparables( - out_chs_right, out_chs_right, kernel_size=5, stride=2, padding=pad_type) - self.comb_iter_2_right = BranchSeparables( - out_chs_right, out_chs_right, kernel_size=3, stride=2, padding=pad_type) - - self.comb_iter_3_left = BranchSeparables( - out_chs_right, out_chs_right, kernel_size=3, padding=pad_type) - self.comb_iter_3_right = create_pool2d('max', 3, stride=2, padding=pad_type) - - self.comb_iter_4_left = BranchSeparables( - in_chs_right, out_chs_right, kernel_size=3, stride=2, stem_cell=True, padding=pad_type) - self.comb_iter_4_right = ActConvBn( - 
out_chs_right, out_chs_right, kernel_size=1, stride=2, padding=pad_type) - - def forward(self, x_left): - x_right = self.conv_1x1(x_left) - x_out = self.cell_forward(x_left, x_right) - return x_out - - -class Cell(CellBase): - - def __init__(self, in_chs_left, out_chs_left, in_chs_right, out_chs_right, pad_type='', - is_reduction=False, match_prev_layer_dims=False): - super(Cell, self).__init__() - - # If `is_reduction` is set to `True` stride 2 is used for - # convolution and pooling layers to reduce the spatial size of - # the output of a cell approximately by a factor of 2. - stride = 2 if is_reduction else 1 - - # If `match_prev_layer_dimensions` is set to `True` - # `FactorizedReduction` is used to reduce the spatial size - # of the left input of a cell approximately by a factor of 2. - self.match_prev_layer_dimensions = match_prev_layer_dims - if match_prev_layer_dims: - self.conv_prev_1x1 = FactorizedReduction(in_chs_left, out_chs_left, padding=pad_type) - else: - self.conv_prev_1x1 = ActConvBn(in_chs_left, out_chs_left, kernel_size=1, padding=pad_type) - self.conv_1x1 = ActConvBn(in_chs_right, out_chs_right, kernel_size=1, padding=pad_type) - - self.comb_iter_0_left = BranchSeparables( - out_chs_left, out_chs_left, kernel_size=5, stride=stride, padding=pad_type) - self.comb_iter_0_right = create_pool2d('max', 3, stride=stride, padding=pad_type) - - self.comb_iter_1_left = BranchSeparables( - out_chs_right, out_chs_right, kernel_size=7, stride=stride, padding=pad_type) - self.comb_iter_1_right = create_pool2d('max', 3, stride=stride, padding=pad_type) - - self.comb_iter_2_left = BranchSeparables( - out_chs_right, out_chs_right, kernel_size=5, stride=stride, padding=pad_type) - self.comb_iter_2_right = BranchSeparables( - out_chs_right, out_chs_right, kernel_size=3, stride=stride, padding=pad_type) - - self.comb_iter_3_left = BranchSeparables(out_chs_right, out_chs_right, kernel_size=3) - self.comb_iter_3_right = create_pool2d('max', 3, stride=stride, padding=pad_type) - - self.comb_iter_4_left = BranchSeparables( - out_chs_left, out_chs_left, kernel_size=3, stride=stride, padding=pad_type) - if is_reduction: - self.comb_iter_4_right = ActConvBn( - out_chs_right, out_chs_right, kernel_size=1, stride=stride, padding=pad_type) - else: - self.comb_iter_4_right = None - - def forward(self, x_left, x_right): - x_left = self.conv_prev_1x1(x_left) - x_right = self.conv_1x1(x_right) - x_out = self.cell_forward(x_left, x_right) - return x_out - - -class PNASNet5Large(nn.Module): - def __init__(self, num_classes=1000, in_chans=3, output_stride=32, drop_rate=0., global_pool='avg', pad_type=''): - super(PNASNet5Large, self).__init__() - self.num_classes = num_classes - self.drop_rate = drop_rate - self.num_features = 4320 - assert output_stride == 32 - - self.conv_0 = ConvBnAct( - in_chans, 96, kernel_size=3, stride=2, padding=0, - norm_layer=partial(nn.BatchNorm2d, eps=0.001, momentum=0.1), apply_act=False) - - self.cell_stem_0 = CellStem0( - in_chs_left=96, out_chs_left=54, in_chs_right=96, out_chs_right=54, pad_type=pad_type) - - self.cell_stem_1 = Cell( - in_chs_left=96, out_chs_left=108, in_chs_right=270, out_chs_right=108, pad_type=pad_type, - match_prev_layer_dims=True, is_reduction=True) - self.cell_0 = Cell( - in_chs_left=270, out_chs_left=216, in_chs_right=540, out_chs_right=216, pad_type=pad_type, - match_prev_layer_dims=True) - self.cell_1 = Cell( - in_chs_left=540, out_chs_left=216, in_chs_right=1080, out_chs_right=216, pad_type=pad_type) - self.cell_2 = Cell( - in_chs_left=1080, 
out_chs_left=216, in_chs_right=1080, out_chs_right=216, pad_type=pad_type) - self.cell_3 = Cell( - in_chs_left=1080, out_chs_left=216, in_chs_right=1080, out_chs_right=216, pad_type=pad_type) - - self.cell_4 = Cell( - in_chs_left=1080, out_chs_left=432, in_chs_right=1080, out_chs_right=432, pad_type=pad_type, - is_reduction=True) - self.cell_5 = Cell( - in_chs_left=1080, out_chs_left=432, in_chs_right=2160, out_chs_right=432, pad_type=pad_type, - match_prev_layer_dims=True) - self.cell_6 = Cell( - in_chs_left=2160, out_chs_left=432, in_chs_right=2160, out_chs_right=432, pad_type=pad_type) - self.cell_7 = Cell( - in_chs_left=2160, out_chs_left=432, in_chs_right=2160, out_chs_right=432, pad_type=pad_type) - - self.cell_8 = Cell( - in_chs_left=2160, out_chs_left=864, in_chs_right=2160, out_chs_right=864, pad_type=pad_type, - is_reduction=True) - self.cell_9 = Cell( - in_chs_left=2160, out_chs_left=864, in_chs_right=4320, out_chs_right=864, pad_type=pad_type, - match_prev_layer_dims=True) - self.cell_10 = Cell( - in_chs_left=4320, out_chs_left=864, in_chs_right=4320, out_chs_right=864, pad_type=pad_type) - self.cell_11 = Cell( - in_chs_left=4320, out_chs_left=864, in_chs_right=4320, out_chs_right=864, pad_type=pad_type) - self.act = nn.ReLU() - self.feature_info = [ - dict(num_chs=96, reduction=2, module='conv_0'), - dict(num_chs=270, reduction=4, module='cell_stem_1.conv_1x1.act'), - dict(num_chs=1080, reduction=8, module='cell_4.conv_1x1.act'), - dict(num_chs=2160, reduction=16, module='cell_8.conv_1x1.act'), - dict(num_chs=4320, reduction=32, module='act'), - ] - - self.global_pool, self.last_linear = create_classifier( - self.num_features, self.num_classes, pool_type=global_pool) - - def get_classifier(self): - return self.last_linear - - def reset_classifier(self, num_classes, global_pool='avg'): - self.num_classes = num_classes - self.global_pool, self.last_linear = create_classifier( - self.num_features, self.num_classes, pool_type=global_pool) - - def forward_features(self, x): - x_conv_0 = self.conv_0(x) - x_stem_0 = self.cell_stem_0(x_conv_0) - x_stem_1 = self.cell_stem_1(x_conv_0, x_stem_0) - x_cell_0 = self.cell_0(x_stem_0, x_stem_1) - x_cell_1 = self.cell_1(x_stem_1, x_cell_0) - x_cell_2 = self.cell_2(x_cell_0, x_cell_1) - x_cell_3 = self.cell_3(x_cell_1, x_cell_2) - x_cell_4 = self.cell_4(x_cell_2, x_cell_3) - x_cell_5 = self.cell_5(x_cell_3, x_cell_4) - x_cell_6 = self.cell_6(x_cell_4, x_cell_5) - x_cell_7 = self.cell_7(x_cell_5, x_cell_6) - x_cell_8 = self.cell_8(x_cell_6, x_cell_7) - x_cell_9 = self.cell_9(x_cell_7, x_cell_8) - x_cell_10 = self.cell_10(x_cell_8, x_cell_9) - x_cell_11 = self.cell_11(x_cell_9, x_cell_10) - x = self.act(x_cell_11) - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.global_pool(x) - if self.drop_rate > 0: - x = F.dropout(x, self.drop_rate, training=self.training) - x = self.last_linear(x) - return x - - -def _create_pnasnet(variant, pretrained=False, **kwargs): - return build_model_with_cfg( - PNASNet5Large, variant, pretrained, - default_cfg=default_cfgs[variant], - feature_cfg=dict(feature_cls='hook', no_rewrite=True), # not possible to re-write this model - **kwargs) - - -@register_model -def pnasnet5large(pretrained=False, **kwargs): - r"""PNASNet-5 model architecture from the - `"Progressive Neural Architecture Search" - <https://arxiv.org/abs/1712.00559>`_ paper.
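    Example (a sketch; the 331x331 input size and 1000 classes follow
    default_cfgs above):

        >>> import torch
        >>> model = pnasnet5large(pretrained=False)
        >>> logits = model(torch.randn(1, 3, 331, 331))  # shape (1, 1000)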
- """ - model_kwargs = dict(pad_type='same', **kwargs) - return _create_pnasnet('pnasnet5large', pretrained, **model_kwargs) diff --git a/AVLFormer/src/timm/models/registry.py b/AVLFormer/src/timm/models/registry.py deleted file mode 100644 index 9172ac7..0000000 --- a/AVLFormer/src/timm/models/registry.py +++ /dev/null @@ -1,139 +0,0 @@ -""" Model Registry -Hacked together by / Copyright 2020 Ross Wightman -""" - -import sys -import re -import fnmatch -from collections import defaultdict -from copy import deepcopy - -__all__ = ['list_models', 'is_model', 'model_entrypoint', 'list_modules', 'is_model_in_modules', - 'is_model_default_key', 'has_model_default_key', 'get_model_default_value', 'is_model_pretrained'] - -_module_to_models = defaultdict(set) # dict of sets to check membership of model in module -_model_to_module = {} # mapping of model names to module names -_model_entrypoints = {} # mapping of model names to entrypoint fns -_model_has_pretrained = set() # set of model names that have pretrained weight url present -_model_default_cfgs = dict() # central repo for model default_cfgs - - -def register_model(fn): - # lookup containing module - mod = sys.modules[fn.__module__] - module_name_split = fn.__module__.split('.') - module_name = module_name_split[-1] if len(module_name_split) else '' - - # add model to __all__ in module - model_name = fn.__name__ - if hasattr(mod, '__all__'): - mod.__all__.append(model_name) - else: - mod.__all__ = [model_name] - - # add entries to registry dict/sets - _model_entrypoints[model_name] = fn - _model_to_module[model_name] = module_name - _module_to_models[module_name].add(model_name) - has_pretrained = False # check if model has a pretrained url to allow filtering on this - if hasattr(mod, 'default_cfgs') and model_name in mod.default_cfgs: - # this will catch all models that have entrypoint matching cfg key, but miss any aliasing - # entrypoints or non-matching combos - has_pretrained = 'url' in mod.default_cfgs[model_name] and 'http' in mod.default_cfgs[model_name]['url'] - _model_default_cfgs[model_name] = deepcopy(mod.default_cfgs[model_name]) - if has_pretrained: - _model_has_pretrained.add(model_name) - return fn - - -def _natural_key(string_): - return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_.lower())] - - -def list_models(filter='', module='', pretrained=False, exclude_filters=''): - """ Return list of available model names, sorted alphabetically - - Args: - filter (str) - Wildcard filter string that works with fnmatch - module (str) - Limit model selection to a specific sub-module (ie 'gen_efficientnet') - pretrained (bool) - Include only models with pretrained weights if True - exclude_filters (str or list[str]) - Wildcard filters to exclude models after including them with filter - - Example: - model_list('gluon_resnet*') -- returns all models starting with 'gluon_resnet' - model_list('*resnext*, 'resnet') -- returns all models with 'resnext' in 'resnet' module - """ - if module: - models = list(_module_to_models[module]) - else: - models = _model_entrypoints.keys() - if filter: - models = fnmatch.filter(models, filter) # include these models - if exclude_filters: - if not isinstance(exclude_filters, list): - exclude_filters = [exclude_filters] - for xf in exclude_filters: - exclude_models = fnmatch.filter(models, xf) # exclude these models - if len(exclude_models): - models = set(models).difference(exclude_models) - if pretrained: - models = _model_has_pretrained.intersection(models) - return 
list(sorted(models, key=_natural_key)) - - -def is_model(model_name): - """ Check if a model name exists - """ - return model_name in _model_entrypoints - - -def model_entrypoint(model_name): - """Fetch a model entrypoint for specified model name - """ - return _model_entrypoints[model_name] - - -def list_modules(): - """ Return list of module names that contain models / model entrypoints - """ - modules = _module_to_models.keys() - return list(sorted(modules)) - - -def is_model_in_modules(model_name, module_names): - """Check if a model exists within a subset of modules - Args: - model_name (str) - name of model to check - module_names (tuple, list, set) - names of modules to search in - """ - assert isinstance(module_names, (tuple, list, set)) - return any(model_name in _module_to_models[n] for n in module_names) - - -def has_model_default_key(model_name, cfg_key): - """ Query model default_cfgs for existence of a specific key. - """ - if model_name in _model_default_cfgs and cfg_key in _model_default_cfgs[model_name]: - return True - return False - - -def is_model_default_key(model_name, cfg_key): - """ Return truthy value for specified model default_cfg key, False if does not exist. - """ - if model_name in _model_default_cfgs and _model_default_cfgs[model_name].get(cfg_key, False): - return True - return False - - -def get_model_default_value(model_name, cfg_key): - """ Get a specific model default_cfg value by key. None if it doesn't exist. - """ - if model_name in _model_default_cfgs: - return _model_default_cfgs[model_name].get(cfg_key, None) - else: - return None - - -def is_model_pretrained(model_name): - return model_name in _model_has_pretrained diff --git a/AVLFormer/src/timm/models/regnet.py b/AVLFormer/src/timm/models/regnet.py deleted file mode 100644 index e4929d9..0000000 --- a/AVLFormer/src/timm/models/regnet.py +++ /dev/null @@ -1,494 +0,0 @@ -"""RegNet - -Paper: `Designing Network Design Spaces` - https://arxiv.org/abs/2003.13678 -Original Impl: https://github.com/facebookresearch/pycls/blob/master/pycls/models/regnet.py - -Based on original PyTorch impl linked above, but re-wrote to use my own blocks (adapted from ResNet here) -and cleaned up with more descriptive variable names. 
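The registry above is what makes every @register_model entrypoint discoverable by name. A short sketch of how it is typically queried, assuming the src.timm import path used in this tree (the model names are only illustrative):

    from src.timm.models.registry import list_models, is_model, model_entrypoint

    print(list_models('regnety_*', pretrained=True))  # fnmatch wildcard, pretrained-only
    if is_model('regnety_032'):
        create_fn = model_entrypoint('regnety_032')   # the function decorated with @register_model
        model = create_fn(pretrained=False)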
- -Weights from original impl have been modified -* first layer from BGR -> RGB as most PyTorch models are -* removed training specific dict entries from checkpoints and keep model state_dict only -* remap names to match the ones here - -Hacked together by / Copyright 2020 Ross Wightman -""" -import numpy as np -import torch.nn as nn - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .helpers import build_model_with_cfg -from .layers import ClassifierHead, AvgPool2dSame, ConvBnAct, SEModule, DropPath -from .registry import register_model - - -def _mcfg(**kwargs): - cfg = dict(se_ratio=0., bottle_ratio=1., stem_width=32) - cfg.update(**kwargs) - return cfg - - -# Model FLOPS = three trailing digits * 10^8 -model_cfgs = dict( - regnetx_002=_mcfg(w0=24, wa=36.44, wm=2.49, group_w=8, depth=13), - regnetx_004=_mcfg(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22), - regnetx_006=_mcfg(w0=48, wa=36.97, wm=2.24, group_w=24, depth=16), - regnetx_008=_mcfg(w0=56, wa=35.73, wm=2.28, group_w=16, depth=16), - regnetx_016=_mcfg(w0=80, wa=34.01, wm=2.25, group_w=24, depth=18), - regnetx_032=_mcfg(w0=88, wa=26.31, wm=2.25, group_w=48, depth=25), - regnetx_040=_mcfg(w0=96, wa=38.65, wm=2.43, group_w=40, depth=23), - regnetx_064=_mcfg(w0=184, wa=60.83, wm=2.07, group_w=56, depth=17), - regnetx_080=_mcfg(w0=80, wa=49.56, wm=2.88, group_w=120, depth=23), - regnetx_120=_mcfg(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19), - regnetx_160=_mcfg(w0=216, wa=55.59, wm=2.1, group_w=128, depth=22), - regnetx_320=_mcfg(w0=320, wa=69.86, wm=2.0, group_w=168, depth=23), - regnety_002=_mcfg(w0=24, wa=36.44, wm=2.49, group_w=8, depth=13, se_ratio=0.25), - regnety_004=_mcfg(w0=48, wa=27.89, wm=2.09, group_w=8, depth=16, se_ratio=0.25), - regnety_006=_mcfg(w0=48, wa=32.54, wm=2.32, group_w=16, depth=15, se_ratio=0.25), - regnety_008=_mcfg(w0=56, wa=38.84, wm=2.4, group_w=16, depth=14, se_ratio=0.25), - regnety_016=_mcfg(w0=48, wa=20.71, wm=2.65, group_w=24, depth=27, se_ratio=0.25), - regnety_032=_mcfg(w0=80, wa=42.63, wm=2.66, group_w=24, depth=21, se_ratio=0.25), - regnety_040=_mcfg(w0=96, wa=31.41, wm=2.24, group_w=64, depth=22, se_ratio=0.25), - regnety_064=_mcfg(w0=112, wa=33.22, wm=2.27, group_w=72, depth=25, se_ratio=0.25), - regnety_080=_mcfg(w0=192, wa=76.82, wm=2.19, group_w=56, depth=17, se_ratio=0.25), - regnety_120=_mcfg(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19, se_ratio=0.25), - regnety_160=_mcfg(w0=200, wa=106.23, wm=2.48, group_w=112, depth=18, se_ratio=0.25), - regnety_320=_mcfg(w0=232, wa=115.89, wm=2.53, group_w=232, depth=20, se_ratio=0.25), -) - - -def _cfg(url='', **kwargs): - return { - 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), - 'crop_pct': 0.875, 'interpolation': 'bicubic', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'stem.conv', 'classifier': 'head.fc', - **kwargs - } - - -default_cfgs = dict( - regnetx_002=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_002-e7e85e5c.pth'), - regnetx_004=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_004-7d0e9424.pth'), - regnetx_006=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_006-85ec1baa.pth'), - regnetx_008=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_008-d8b470eb.pth'), - 
regnetx_016=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_016-65ca972a.pth'), - regnetx_032=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_032-ed0c7f7e.pth'), - regnetx_040=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_040-73c2a654.pth'), - regnetx_064=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_064-29278baa.pth'), - regnetx_080=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_080-7c7fcab1.pth'), - regnetx_120=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_120-65d5521e.pth'), - regnetx_160=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_160-c98c4112.pth'), - regnetx_320=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_320-8ea38b93.pth'), - regnety_002=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_002-e68ca334.pth'), - regnety_004=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_004-0db870e6.pth'), - regnety_006=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_006-c67e57ec.pth'), - regnety_008=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_008-dc900dbe.pth'), - regnety_016=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_016-54367f74.pth'), - regnety_032=_cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/regnety_032_ra-7f2439f9.pth', - crop_pct=1.0, test_input_size=(3, 288, 288)), - regnety_040=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_040-f0d569f9.pth'), - regnety_064=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_064-0a48325c.pth'), - regnety_080=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_080-e7f3eb93.pth'), - regnety_120=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_120-721ba79a.pth'), - regnety_160=_cfg( - url='https://dl.fbaipublicfiles.com/deit/regnety_160-a5fe301d.pth', # from Facebook DeiT GitHub repository - crop_pct=1.0, test_input_size=(3, 288, 288)), - regnety_320=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_320-ba464b29.pth'), -) - - -def quantize_float(f, q): - """Converts a float to closest non-zero int divisible by q.""" - return int(round(f / q) * q) - - -def adjust_widths_groups_comp(widths, bottle_ratios, groups): - """Adjusts the compatibility of widths and groups.""" - bottleneck_widths = [int(w * b) for w, b in zip(widths, bottle_ratios)] - groups = [min(g, w_bot) for g, w_bot in zip(groups, bottleneck_widths)] - bottleneck_widths = [quantize_float(w_bot, g) for w_bot, g in zip(bottleneck_widths, groups)] - widths = [int(w_bot / b) for w_bot, b in zip(bottleneck_widths, bottle_ratios)] - return widths, groups - - -def generate_regnet(width_slope, width_initial, width_mult, depth, q=8): - """Generates per block widths from RegNet parameters.""" - assert width_slope >= 0 and width_initial > 0 and width_mult > 1 and 
width_initial % q == 0 - widths_cont = np.arange(depth) * width_slope + width_initial - width_exps = np.round(np.log(widths_cont / width_initial) / np.log(width_mult)) - widths = width_initial * np.power(width_mult, width_exps) - widths = np.round(np.divide(widths, q)) * q - num_stages, max_stage = len(np.unique(widths)), width_exps.max() + 1 - widths, widths_cont = widths.astype(int).tolist(), widths_cont.tolist() - return widths, num_stages, max_stage, widths_cont - - -class Bottleneck(nn.Module): - """ RegNet Bottleneck - - This is almost exactly the same as a ResNet Bottleneck. The main difference is the SE block is moved from - after conv3 to after conv2. Otherwise, it's just redefining the arguments for groups/bottleneck channels. - """ - - def __init__(self, in_chs, out_chs, stride=1, dilation=1, bottleneck_ratio=1, group_width=1, se_ratio=0.25, - downsample=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None, - drop_block=None, drop_path=None): - super(Bottleneck, self).__init__() - bottleneck_chs = int(round(out_chs * bottleneck_ratio)) - groups = bottleneck_chs // group_width - - cargs = dict(act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer, drop_block=drop_block) - self.conv1 = ConvBnAct(in_chs, bottleneck_chs, kernel_size=1, **cargs) - self.conv2 = ConvBnAct( - bottleneck_chs, bottleneck_chs, kernel_size=3, stride=stride, dilation=dilation, - groups=groups, **cargs) - if se_ratio: - se_channels = int(round(in_chs * se_ratio)) - self.se = SEModule(bottleneck_chs, reduction_channels=se_channels) - else: - self.se = None - cargs['act_layer'] = None - self.conv3 = ConvBnAct(bottleneck_chs, out_chs, kernel_size=1, **cargs) - self.act3 = act_layer(inplace=True) - self.downsample = downsample - self.drop_path = drop_path - - def zero_init_last_bn(self): - nn.init.zeros_(self.conv3.bn.weight) - - def forward(self, x): - shortcut = x - x = self.conv1(x) - x = self.conv2(x) - if self.se is not None: - x = self.se(x) - x = self.conv3(x) - if self.drop_path is not None: - x = self.drop_path(x) - if self.downsample is not None: - shortcut = self.downsample(shortcut) - x += shortcut - x = self.act3(x) - return x - - -def downsample_conv( - in_chs, out_chs, kernel_size, stride=1, dilation=1, norm_layer=None): - norm_layer = norm_layer or nn.BatchNorm2d - kernel_size = 1 if stride == 1 and dilation == 1 else kernel_size - dilation = dilation if kernel_size > 1 else 1 - return ConvBnAct( - in_chs, out_chs, kernel_size, stride=stride, dilation=dilation, norm_layer=norm_layer, act_layer=None) - - -def downsample_avg( - in_chs, out_chs, kernel_size, stride=1, dilation=1, norm_layer=None): - """ AvgPool Downsampling as in 'D' ResNet variants.
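To make generate_regnet above concrete, a worked sketch for the regnetx_002 config (w0=24, wa=36.44, wm=2.49, depth=13); the printed per-block widths recover the published RegNetX-200MF stage layout:

    import numpy as np

    w0, wa, wm, depth, q = 24, 36.44, 2.49, 13, 8
    widths_cont = np.arange(depth) * wa + w0                      # linear ramp over blocks
    width_exps = np.round(np.log(widths_cont / w0) / np.log(wm))  # snap each block to a power of wm
    widths = np.round(w0 * np.power(wm, width_exps) / q) * q      # quantize to a multiple of q
    print(widths.astype(int).tolist())
    # [24, 56, 152, 152, 152, 152, 368, 368, 368, 368, 368, 368, 368]
    # i.e. four stages of widths (24, 56, 152, 368) with depths (1, 1, 4, 7)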
This is not in RegNet space but I might experiment.""" - norm_layer = norm_layer or nn.BatchNorm2d - avg_stride = stride if dilation == 1 else 1 - pool = nn.Identity() - if stride > 1 or dilation > 1: - avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d - pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False) - return nn.Sequential(*[ - pool, ConvBnAct(in_chs, out_chs, 1, stride=1, norm_layer=norm_layer, act_layer=None)]) - - -class RegStage(nn.Module): - """Stage (sequence of blocks w/ the same output shape).""" - - def __init__(self, in_chs, out_chs, stride, dilation, depth, bottle_ratio, group_width, - block_fn=Bottleneck, se_ratio=0., drop_path_rates=None, drop_block=None): - super(RegStage, self).__init__() - block_kwargs = {} # FIXME setup to pass various aa, norm, act layer common args - first_dilation = 1 if dilation in (1, 2) else 2 - for i in range(depth): - block_stride = stride if i == 0 else 1 - block_in_chs = in_chs if i == 0 else out_chs - block_dilation = first_dilation if i == 0 else dilation - if drop_path_rates is not None and drop_path_rates[i] > 0.: - drop_path = DropPath(drop_path_rates[i]) - else: - drop_path = None - if (block_in_chs != out_chs) or (block_stride != 1): - proj_block = downsample_conv(block_in_chs, out_chs, 1, block_stride, block_dilation) - else: - proj_block = None - - name = "b{}".format(i + 1) - self.add_module( - name, block_fn( - block_in_chs, out_chs, block_stride, block_dilation, bottle_ratio, group_width, se_ratio, - downsample=proj_block, drop_block=drop_block, drop_path=drop_path, **block_kwargs) - ) - - def forward(self, x): - for block in self.children(): - x = block(x) - return x - - -class RegNet(nn.Module): - """RegNet model. - - Paper: https://arxiv.org/abs/2003.13678 - Original Impl: https://github.com/facebookresearch/pycls/blob/master/pycls/models/regnet.py - """ - - def __init__(self, cfg, in_chans=3, num_classes=1000, output_stride=32, global_pool='avg', drop_rate=0., - drop_path_rate=0., zero_init_last_bn=True): - super().__init__() - # TODO add drop block, drop path, anti-aliasing, custom bn/act args - self.num_classes = num_classes - self.drop_rate = drop_rate - assert output_stride in (8, 16, 32) - - # Construct the stem - stem_width = cfg['stem_width'] - self.stem = ConvBnAct(in_chans, stem_width, 3, stride=2) - self.feature_info = [dict(num_chs=stem_width, reduction=2, module='stem')] - - # Construct the stages - prev_width = stem_width - curr_stride = 2 - stage_params = self._get_stage_params(cfg, output_stride=output_stride, drop_path_rate=drop_path_rate) - se_ratio = cfg['se_ratio'] - for i, stage_args in enumerate(stage_params): - stage_name = "s{}".format(i + 1) - self.add_module(stage_name, RegStage(prev_width, **stage_args, se_ratio=se_ratio)) - prev_width = stage_args['out_chs'] - curr_stride *= stage_args['stride'] - self.feature_info += [dict(num_chs=prev_width, reduction=curr_stride, module=stage_name)] - - # Construct the head - self.num_features = prev_width - self.head = ClassifierHead( - in_chs=prev_width, num_classes=num_classes, pool_type=global_pool, drop_rate=drop_rate) - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - elif isinstance(m, nn.BatchNorm2d): - nn.init.ones_(m.weight) - nn.init.zeros_(m.bias) - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, mean=0.0, std=0.01) - nn.init.zeros_(m.bias) - if zero_init_last_bn: - for m in self.modules(): - if 
hasattr(m, 'zero_init_last_bn'): - m.zero_init_last_bn() - - def _get_stage_params(self, cfg, default_stride=2, output_stride=32, drop_path_rate=0.): - # Generate RegNet ws per block - w_a, w_0, w_m, d = cfg['wa'], cfg['w0'], cfg['wm'], cfg['depth'] - widths, num_stages, _, _ = generate_regnet(w_a, w_0, w_m, d) - - # Convert to per stage format - stage_widths, stage_depths = np.unique(widths, return_counts=True) - - # Use the same group width, bottleneck mult and stride for each stage - stage_groups = [cfg['group_w'] for _ in range(num_stages)] - stage_bottle_ratios = [cfg['bottle_ratio'] for _ in range(num_stages)] - stage_strides = [] - stage_dilations = [] - net_stride = 2 - dilation = 1 - for _ in range(num_stages): - if net_stride >= output_stride: - dilation *= default_stride - stride = 1 - else: - stride = default_stride - net_stride *= stride - stage_strides.append(stride) - stage_dilations.append(dilation) - stage_dpr = np.split(np.linspace(0, drop_path_rate, d), np.cumsum(stage_depths[:-1])) - - # Adjust the compatibility of ws and gws - stage_widths, stage_groups = adjust_widths_groups_comp(stage_widths, stage_bottle_ratios, stage_groups) - param_names = ['out_chs', 'stride', 'dilation', 'depth', 'bottle_ratio', 'group_width', 'drop_path_rates'] - stage_params = [ - dict(zip(param_names, params)) for params in - zip(stage_widths, stage_strides, stage_dilations, stage_depths, stage_bottle_ratios, stage_groups, - stage_dpr)] - return stage_params - - def get_classifier(self): - return self.head.fc - - def reset_classifier(self, num_classes, global_pool='avg'): - self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) - - def forward_features(self, x): - for block in list(self.children())[:-1]: - x = block(x) - return x - - def forward(self, x): - for block in self.children(): - x = block(x) - return x - - -def _filter_fn(state_dict): - """ unwrap checkpoints that nest the weights under a 'model' key, keeping only the state_dict""" - if 'model' in state_dict: - # For DeiT trained regnety_160 pretrained model - state_dict = state_dict['model'] - return state_dict - - -def _create_regnet(variant, pretrained, **kwargs): - return build_model_with_cfg( - RegNet, variant, pretrained, - default_cfg=default_cfgs[variant], - model_cfg=model_cfgs[variant], - pretrained_filter_fn=_filter_fn, - **kwargs) - - -@register_model -def regnetx_002(pretrained=False, **kwargs): - """RegNetX-200MF""" - return _create_regnet('regnetx_002', pretrained, **kwargs) - - -@register_model -def regnetx_004(pretrained=False, **kwargs): - """RegNetX-400MF""" - return _create_regnet('regnetx_004', pretrained, **kwargs) - - -@register_model -def regnetx_006(pretrained=False, **kwargs): - """RegNetX-600MF""" - return _create_regnet('regnetx_006', pretrained, **kwargs) - - -@register_model -def regnetx_008(pretrained=False, **kwargs): - """RegNetX-800MF""" - return _create_regnet('regnetx_008', pretrained, **kwargs) - - -@register_model -def regnetx_016(pretrained=False, **kwargs): - """RegNetX-1.6GF""" - return _create_regnet('regnetx_016', pretrained, **kwargs) - - -@register_model -def regnetx_032(pretrained=False, **kwargs): - """RegNetX-3.2GF""" - return _create_regnet('regnetx_032', pretrained, **kwargs) - - -@register_model -def regnetx_040(pretrained=False, **kwargs): - """RegNetX-4.0GF""" - return _create_regnet('regnetx_040', pretrained, **kwargs) - - -@register_model -def regnetx_064(pretrained=False, **kwargs): - """RegNetX-6.4GF""" - return
_create_regnet('regnetx_064', pretrained, **kwargs) - - -@register_model -def regnetx_080(pretrained=False, **kwargs): - """RegNetX-8.0GF""" - return _create_regnet('regnetx_080', pretrained, **kwargs) - - -@register_model -def regnetx_120(pretrained=False, **kwargs): - """RegNetX-12GF""" - return _create_regnet('regnetx_120', pretrained, **kwargs) - - -@register_model -def regnetx_160(pretrained=False, **kwargs): - """RegNetX-16GF""" - return _create_regnet('regnetx_160', pretrained, **kwargs) - - -@register_model -def regnetx_320(pretrained=False, **kwargs): - """RegNetX-32GF""" - return _create_regnet('regnetx_320', pretrained, **kwargs) - - -@register_model -def regnety_002(pretrained=False, **kwargs): - """RegNetY-200MF""" - return _create_regnet('regnety_002', pretrained, **kwargs) - - -@register_model -def regnety_004(pretrained=False, **kwargs): - """RegNetY-400MF""" - return _create_regnet('regnety_004', pretrained, **kwargs) - - -@register_model -def regnety_006(pretrained=False, **kwargs): - """RegNetY-600MF""" - return _create_regnet('regnety_006', pretrained, **kwargs) - - -@register_model -def regnety_008(pretrained=False, **kwargs): - """RegNetY-800MF""" - return _create_regnet('regnety_008', pretrained, **kwargs) - - -@register_model -def regnety_016(pretrained=False, **kwargs): - """RegNetY-1.6GF""" - return _create_regnet('regnety_016', pretrained, **kwargs) - - -@register_model -def regnety_032(pretrained=False, **kwargs): - """RegNetY-3.2GF""" - return _create_regnet('regnety_032', pretrained, **kwargs) - - -@register_model -def regnety_040(pretrained=False, **kwargs): - """RegNetY-4.0GF""" - return _create_regnet('regnety_040', pretrained, **kwargs) - - -@register_model -def regnety_064(pretrained=False, **kwargs): - """RegNetY-6.4GF""" - return _create_regnet('regnety_064', pretrained, **kwargs) - - -@register_model -def regnety_080(pretrained=False, **kwargs): - """RegNetY-8.0GF""" - return _create_regnet('regnety_080', pretrained, **kwargs) - - -@register_model -def regnety_120(pretrained=False, **kwargs): - """RegNetY-12GF""" - return _create_regnet('regnety_120', pretrained, **kwargs) - - -@register_model -def regnety_160(pretrained=False, **kwargs): - """RegNetY-16GF""" - return _create_regnet('regnety_160', pretrained, **kwargs) - - -@register_model -def regnety_320(pretrained=False, **kwargs): - """RegNetY-32GF""" - return _create_regnet('regnety_320', pretrained, **kwargs) diff --git a/AVLFormer/src/timm/models/res2net.py b/AVLFormer/src/timm/models/res2net.py deleted file mode 100644 index 183551e..0000000 --- a/AVLFormer/src/timm/models/res2net.py +++ /dev/null @@ -1,216 +0,0 @@ -""" Res2Net and Res2NeXt -Adapted from Official Pytorch impl at: https://github.com/gasvn/Res2Net/ -Paper: `Res2Net: A New Multi-scale Backbone Architecture` - https://arxiv.org/abs/1904.01169 -""" -import math - -import torch -import torch.nn as nn - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .helpers import build_model_with_cfg -from .registry import register_model -from .resnet import ResNet - -__all__ = [] - - -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), - 'crop_pct': 0.875, 'interpolation': 'bilinear', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'conv1', 'classifier': 'fc', - **kwargs - } - - -default_cfgs = { - 'res2net50_26w_4s': _cfg( - 
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2net50_26w_4s-06e79181.pth'), - 'res2net50_48w_2s': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2net50_48w_2s-afed724a.pth'), - 'res2net50_14w_8s': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2net50_14w_8s-6527dddc.pth'), - 'res2net50_26w_6s': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2net50_26w_6s-19041792.pth'), - 'res2net50_26w_8s': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2net50_26w_8s-2c7c9f12.pth'), - 'res2net101_26w_4s': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2net101_26w_4s-02a759a1.pth'), - 'res2next50': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2next50_4s-6ef7e7bf.pth'), -} - - -class Bottle2neck(nn.Module): - """ Res2Net/Res2NeXT Bottleneck - Adapted from https://github.com/gasvn/Res2Net/blob/master/res2net.py - """ - expansion = 4 - - def __init__(self, inplanes, planes, stride=1, downsample=None, - cardinality=1, base_width=26, scale=4, dilation=1, first_dilation=None, - act_layer=nn.ReLU, norm_layer=None, attn_layer=None, **_): - super(Bottle2neck, self).__init__() - self.scale = scale - self.is_first = stride > 1 or downsample is not None - self.num_scales = max(1, scale - 1) - width = int(math.floor(planes * (base_width / 64.0))) * cardinality - self.width = width - outplanes = planes * self.expansion - first_dilation = first_dilation or dilation - - self.conv1 = nn.Conv2d(inplanes, width * scale, kernel_size=1, bias=False) - self.bn1 = norm_layer(width * scale) - - convs = [] - bns = [] - for i in range(self.num_scales): - convs.append(nn.Conv2d( - width, width, kernel_size=3, stride=stride, padding=first_dilation, - dilation=first_dilation, groups=cardinality, bias=False)) - bns.append(norm_layer(width)) - self.convs = nn.ModuleList(convs) - self.bns = nn.ModuleList(bns) - if self.is_first: - # FIXME this should probably have count_include_pad=False, but hurts original weights - self.pool = nn.AvgPool2d(kernel_size=3, stride=stride, padding=1) - else: - self.pool = None - - self.conv3 = nn.Conv2d(width * scale, outplanes, kernel_size=1, bias=False) - self.bn3 = norm_layer(outplanes) - self.se = attn_layer(outplanes) if attn_layer is not None else None - - self.relu = act_layer(inplace=True) - self.downsample = downsample - - def zero_init_last_bn(self): - nn.init.zeros_(self.bn3.weight) - - def forward(self, x): - residual = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - spx = torch.split(out, self.width, 1) - spo = [] - sp = spx[0] # redundant, for torchscript - for i, (conv, bn) in enumerate(zip(self.convs, self.bns)): - if i == 0 or self.is_first: - sp = spx[i] - else: - sp = sp + spx[i] - sp = conv(sp) - sp = bn(sp) - sp = self.relu(sp) - spo.append(sp) - if self.scale > 1: - if self.pool is not None: - # self.is_first == True, None check for torchscript - spo.append(self.pool(spx[-1])) - else: - spo.append(spx[-1]) - out = torch.cat(spo, 1) - - out = self.conv3(out) - out = self.bn3(out) - - if self.se is not None: - out = self.se(out) - - if self.downsample is not None: - residual = self.downsample(x) - - out += residual - out = self.relu(out) - - return out - - -def _create_res2net(variant, pretrained=False, **kwargs): - return 
build_model_with_cfg( - ResNet, variant, pretrained, - default_cfg=default_cfgs[variant], - **kwargs) - - -@register_model -def res2net50_26w_4s(pretrained=False, **kwargs): - """Constructs a Res2Net-50 26w4s model. - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - model_args = dict( - block=Bottle2neck, layers=[3, 4, 6, 3], base_width=26, block_args=dict(scale=4), **kwargs) - return _create_res2net('res2net50_26w_4s', pretrained, **model_args) - - -@register_model -def res2net101_26w_4s(pretrained=False, **kwargs): - """Constructs a Res2Net-101 26w4s model. - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - model_args = dict( - block=Bottle2neck, layers=[3, 4, 23, 3], base_width=26, block_args=dict(scale=4), **kwargs) - return _create_res2net('res2net101_26w_4s', pretrained, **model_args) - - -@register_model -def res2net50_26w_6s(pretrained=False, **kwargs): - """Constructs a Res2Net-50 26w6s model. - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - model_args = dict( - block=Bottle2neck, layers=[3, 4, 6, 3], base_width=26, block_args=dict(scale=6), **kwargs) - return _create_res2net('res2net50_26w_6s', pretrained, **model_args) - - -@register_model -def res2net50_26w_8s(pretrained=False, **kwargs): - """Constructs a Res2Net-50 26w8s model. - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - model_args = dict( - block=Bottle2neck, layers=[3, 4, 6, 3], base_width=26, block_args=dict(scale=8), **kwargs) - return _create_res2net('res2net50_26w_8s', pretrained, **model_args) - - -@register_model -def res2net50_48w_2s(pretrained=False, **kwargs): - """Constructs a Res2Net-50 48w2s model. - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - model_args = dict( - block=Bottle2neck, layers=[3, 4, 6, 3], base_width=48, block_args=dict(scale=2), **kwargs) - return _create_res2net('res2net50_48w_2s', pretrained, **model_args) - - -@register_model -def res2net50_14w_8s(pretrained=False, **kwargs): - """Constructs a Res2Net-50 14w8s model. 
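The forward of Bottle2neck above is the interesting part: the 1x1 output is split into `scale` chunks, and each 3x3 branch processes its own chunk plus the previous branch's output, so the effective receptive field grows across branches. A sketch of that wiring for a stride-1, non-first block (BatchNorm omitted; dimensions are illustrative):

    import torch
    import torch.nn as nn

    scale, width = 4, 26
    convs = nn.ModuleList(nn.Conv2d(width, width, 3, padding=1, bias=False)
                          for _ in range(scale - 1))   # num_scales = scale - 1
    x = torch.randn(2, scale * width, 56, 56)          # output of the 1x1 conv
    spx = torch.split(x, width, dim=1)                 # 'scale' chunks of 'width' channels
    spo, sp = [], spx[0]
    for i, conv in enumerate(convs):
        sp = spx[i] if i == 0 else sp + spx[i]         # hierarchical accumulation
        sp = torch.relu(conv(sp))
        spo.append(sp)
    spo.append(spx[-1])                                # last chunk passes through untouched
    out = torch.cat(spo, dim=1)                        # back to scale * width channels
    assert out.shape == x.shape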
- Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - model_args = dict( - block=Bottle2neck, layers=[3, 4, 6, 3], base_width=14, block_args=dict(scale=8), **kwargs) - return _create_res2net('res2net50_14w_8s', pretrained, **model_args) - - -@register_model -def res2next50(pretrained=False, **kwargs): - """Construct Res2NeXt-50 4s - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - model_args = dict( - block=Bottle2neck, layers=[3, 4, 6, 3], base_width=4, cardinality=8, block_args=dict(scale=4), **kwargs) - return _create_res2net('res2next50', pretrained, **model_args) diff --git a/AVLFormer/src/timm/models/resnest.py b/AVLFormer/src/timm/models/resnest.py deleted file mode 100644 index 694338d..0000000 --- a/AVLFormer/src/timm/models/resnest.py +++ /dev/null @@ -1,238 +0,0 @@ -""" ResNeSt Models - -Paper: `ResNeSt: Split-Attention Networks` - https://arxiv.org/abs/2004.08955 - -Adapted from original PyTorch impl w/ weights at https://github.com/zhanghang1989/ResNeSt by Hang Zhang - -Modified for torchscript compat, and consistency with timm by Ross Wightman -""" -import torch -from torch import nn - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .helpers import build_model_with_cfg -from .layers import SplitAttnConv2d -from .registry import register_model -from .resnet import ResNet - - -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), - 'crop_pct': 0.875, 'interpolation': 'bilinear', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'conv1.0', 'classifier': 'fc', - **kwargs - } - -default_cfgs = { - 'resnest14d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/gluon_resnest14-9c8fe254.pth'), - 'resnest26d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/gluon_resnest26-50eb607c.pth'), - 'resnest50d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-resnest/resnest50-528c19ca.pth'), - 'resnest101e': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-resnest/resnest101-22405ba7.pth', - input_size=(3, 256, 256), pool_size=(8, 8)), - 'resnest200e': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-resnest/resnest200-75117900.pth', - input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=0.909, interpolation='bicubic'), - 'resnest269e': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-resnest/resnest269-0cc87c48.pth', - input_size=(3, 416, 416), pool_size=(13, 13), crop_pct=0.928, interpolation='bicubic'), - 'resnest50d_4s2x40d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-resnest/resnest50_fast_4s2x40d-41d14ed0.pth', - interpolation='bicubic'), - 'resnest50d_1s4x24d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-resnest/resnest50_fast_1s4x24d-d4a4f76f.pth', - interpolation='bicubic') -} - - -class ResNestBottleneck(nn.Module): - """ResNet Bottleneck - """ - # pylint: disable=unused-argument - expansion = 4 - - def __init__(self, inplanes, planes, stride=1, downsample=None, - radix=1, cardinality=1, base_width=64, avd=False, avd_first=False, is_first=False, - reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, - attn_layer=None, 
aa_layer=None, drop_block=None, drop_path=None): - super(ResNestBottleneck, self).__init__() - assert reduce_first == 1  # not supported - assert attn_layer is None  # not supported - assert aa_layer is None  # TODO not yet supported - assert drop_path is None  # TODO not yet supported - - group_width = int(planes * (base_width / 64.)) * cardinality - first_dilation = first_dilation or dilation - if avd and (stride > 1 or is_first): - avd_stride = stride - stride = 1 - else: - avd_stride = 0 - self.radix = radix - self.drop_block = drop_block - - self.conv1 = nn.Conv2d(inplanes, group_width, kernel_size=1, bias=False) - self.bn1 = norm_layer(group_width) - self.act1 = act_layer(inplace=True) - self.avd_first = nn.AvgPool2d(3, avd_stride, padding=1) if avd_stride > 0 and avd_first else None - - if self.radix >= 1: - self.conv2 = SplitAttnConv2d( - group_width, group_width, kernel_size=3, stride=stride, padding=first_dilation, - dilation=first_dilation, groups=cardinality, radix=radix, norm_layer=norm_layer, drop_block=drop_block) - self.bn2 = None  # FIXME revisit, here to satisfy current torchscript fussiness - self.act2 = None - else: - self.conv2 = nn.Conv2d( - group_width, group_width, kernel_size=3, stride=stride, padding=first_dilation, - dilation=first_dilation, groups=cardinality, bias=False) - self.bn2 = norm_layer(group_width) - self.act2 = act_layer(inplace=True) - self.avd_last = nn.AvgPool2d(3, avd_stride, padding=1) if avd_stride > 0 and not avd_first else None - - self.conv3 = nn.Conv2d(group_width, planes * 4, kernel_size=1, bias=False) - self.bn3 = norm_layer(planes*4) - self.act3 = act_layer(inplace=True) - self.downsample = downsample - - def zero_init_last_bn(self): - nn.init.zeros_(self.bn3.weight) - - def forward(self, x): - residual = x - - out = self.conv1(x) - out = self.bn1(out) - if self.drop_block is not None: - out = self.drop_block(out) - out = self.act1(out) - - if self.avd_first is not None: - out = self.avd_first(out) - - out = self.conv2(out) - if self.bn2 is not None: - out = self.bn2(out) - if self.drop_block is not None: - out = self.drop_block(out) - out = self.act2(out) - - if self.avd_last is not None: - out = self.avd_last(out) - - out = self.conv3(out) - out = self.bn3(out) - if self.drop_block is not None: - out = self.drop_block(out) - - if self.downsample is not None: - residual = self.downsample(x) - - out += residual - out = self.act3(out) - return out - - -def _create_resnest(variant, pretrained=False, **kwargs): - return build_model_with_cfg( - ResNet, variant, pretrained, - default_cfg=default_cfgs[variant], - **kwargs) - - -@register_model -def resnest14d(pretrained=False, **kwargs): - """ ResNeSt-14d model. Weights ported from GluonCV. - """ - model_kwargs = dict( - block=ResNestBottleneck, layers=[1, 1, 1, 1], - stem_type='deep', stem_width=32, avg_down=True, base_width=64, cardinality=1, - block_args=dict(radix=2, avd=True, avd_first=False), **kwargs) - return _create_resnest('resnest14d', pretrained=pretrained, **model_kwargs) - - -@register_model -def resnest26d(pretrained=False, **kwargs): - """ ResNeSt-26d model. Weights ported from GluonCV. - """ - model_kwargs = dict( - block=ResNestBottleneck, layers=[2, 2, 2, 2], - stem_type='deep', stem_width=32, avg_down=True, base_width=64, cardinality=1, - block_args=dict(radix=2, avd=True, avd_first=False), **kwargs) - return _create_resnest('resnest26d', pretrained=pretrained, **model_kwargs) - - -@register_model -def resnest50d(pretrained=False, **kwargs): - """ ResNeSt-50d model.
Matches paper ResNeSt-50 model, https://arxiv.org/abs/2004.08955 - Since this codebase supports all possible variations, 'd' for deep stem, stem_width 32, avg in downsample. - """ - model_kwargs = dict( - block=ResNestBottleneck, layers=[3, 4, 6, 3], - stem_type='deep', stem_width=32, avg_down=True, base_width=64, cardinality=1, - block_args=dict(radix=2, avd=True, avd_first=False), **kwargs) - return _create_resnest('resnest50d', pretrained=pretrained, **model_kwargs) - - -@register_model -def resnest101e(pretrained=False, **kwargs): - """ ResNeSt-101e model. Matches paper ResNeSt-101 model, https://arxiv.org/abs/2004.08955 - Since this codebase supports all possible variations, 'e' for deep stem, stem_width 64, avg in downsample. - """ - model_kwargs = dict( - block=ResNestBottleneck, layers=[3, 4, 23, 3], - stem_type='deep', stem_width=64, avg_down=True, base_width=64, cardinality=1, - block_args=dict(radix=2, avd=True, avd_first=False), **kwargs) - return _create_resnest('resnest101e', pretrained=pretrained, **model_kwargs) - - -@register_model -def resnest200e(pretrained=False, **kwargs): - """ ResNeSt-200e model. Matches paper ResNeSt-200 model, https://arxiv.org/abs/2004.08955 - Since this codebase supports all possible variations, 'e' for deep stem, stem_width 64, avg in downsample. - """ - model_kwargs = dict( - block=ResNestBottleneck, layers=[3, 24, 36, 3], - stem_type='deep', stem_width=64, avg_down=True, base_width=64, cardinality=1, - block_args=dict(radix=2, avd=True, avd_first=False), **kwargs) - return _create_resnest('resnest200e', pretrained=pretrained, **model_kwargs) - - -@register_model -def resnest269e(pretrained=False, **kwargs): - """ ResNeSt-269e model. Matches paper ResNeSt-269 model, https://arxiv.org/abs/2004.08955 - Since this codebase supports all possible variations, 'e' for deep stem, stem_width 64, avg in downsample. - """ - model_kwargs = dict( - block=ResNestBottleneck, layers=[3, 30, 48, 8], - stem_type='deep', stem_width=64, avg_down=True, base_width=64, cardinality=1, - block_args=dict(radix=2, avd=True, avd_first=False), **kwargs) - return _create_resnest('resnest269e', pretrained=pretrained, **model_kwargs) - - -@register_model -def resnest50d_4s2x40d(pretrained=False, **kwargs): - """ResNeSt-50 4s2x40d from https://github.com/zhanghang1989/ResNeSt/blob/master/ablation.md - """ - model_kwargs = dict( - block=ResNestBottleneck, layers=[3, 4, 6, 3], - stem_type='deep', stem_width=32, avg_down=True, base_width=40, cardinality=2, - block_args=dict(radix=4, avd=True, avd_first=True), **kwargs) - return _create_resnest('resnest50d_4s2x40d', pretrained=pretrained, **model_kwargs) - - -@register_model -def resnest50d_1s4x24d(pretrained=False, **kwargs): - """ResNeSt-50 1s4x24d from https://github.com/zhanghang1989/ResNeSt/blob/master/ablation.md - """ - model_kwargs = dict( - block=ResNestBottleneck, layers=[3, 4, 6, 3], - stem_type='deep', stem_width=32, avg_down=True, base_width=24, cardinality=4, - block_args=dict(radix=1, avd=True, avd_first=True), **kwargs) - return _create_resnest('resnest50d_1s4x24d', pretrained=pretrained, **model_kwargs) diff --git a/AVLFormer/src/timm/models/resnet.py b/AVLFormer/src/timm/models/resnet.py deleted file mode 100644 index 0a7a948..0000000 --- a/AVLFormer/src/timm/models/resnet.py +++ /dev/null @@ -1,1297 +0,0 @@ -"""PyTorch ResNet - -This started as a copy of https://github.com/pytorch/vision 'resnet.py' (BSD-3-Clause) with -additional dropout and dynamic global avg/max pool. 
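One detail worth flagging in ResNestBottleneck above: when avd is enabled and the block strides (or is the first of its stage), the stride is pulled out of the 3x3 split-attention conv and delegated to an AvgPool2d placed before (avd_first) or after it, keeping the conv itself stride-1. A sketch of just that relocation (the channel count is illustrative):

    import torch.nn as nn

    stride, avd, avd_first, is_first = 2, True, False, True
    if avd and (stride > 1 or is_first):
        avd_stride, stride = stride, 1                 # the conv becomes stride-1
    else:
        avd_stride = 0
    avd_pool = nn.AvgPool2d(3, avd_stride, padding=1) if avd_stride > 0 else None
    conv2 = nn.Conv2d(64, 64, 3, stride=stride, padding=1, bias=False)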
- -ResNeXt, SE-ResNeXt, SENet, and MXNet Gluon stem/downsample variants, tiered stems added by Ross Wightman -Copyright 2020 Ross Wightman -""" -import math - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .helpers import build_model_with_cfg -from .layers import DropBlock2d, DropPath, AvgPool2dSame, BlurPool2d, create_attn, create_classifier -from .registry import register_model - -__all__ = ['ResNet', 'BasicBlock', 'Bottleneck'] # model_registry will add each entrypoint fn to this - - -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), - 'crop_pct': 0.875, 'interpolation': 'bilinear', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'conv1', 'classifier': 'fc', - **kwargs - } - - -default_cfgs = { - # ResNet and Wide ResNet - 'resnet18': _cfg(url='https://download.pytorch.org/models/resnet18-5c106cde.pth'), - 'resnet18d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet18d_ra2-48a79e06.pth', - interpolation='bicubic', first_conv='conv1.0'), - 'resnet34': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet34-43635321.pth'), - 'resnet34d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet34d_ra2-f8dcfcaf.pth', - interpolation='bicubic', first_conv='conv1.0'), - 'resnet26': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet26-9aa10e23.pth', - interpolation='bicubic'), - 'resnet26d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet26d-69e92c46.pth', - interpolation='bicubic', first_conv='conv1.0'), - 'resnet50': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet50_ram-a26f946b.pth', - interpolation='bicubic'), - 'resnet50d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet50d_ra2-464e36ba.pth', - interpolation='bicubic', first_conv='conv1.0'), - 'resnet101': _cfg(url='', interpolation='bicubic'), - 'resnet101d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet101d_ra2-2803ffab.pth', - interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8), - crop_pct=1.0, test_input_size=(3, 320, 320)), - 'resnet152': _cfg(url='', interpolation='bicubic'), - 'resnet152d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet152d_ra2-5cac0439.pth', - interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8), - crop_pct=1.0, test_input_size=(3, 320, 320)), - 'resnet200': _cfg(url='', interpolation='bicubic'), - 'resnet200d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet200d_ra2-bdba9bf9.pth', - interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8), - crop_pct=1.0, test_input_size=(3, 320, 320)), - 'tv_resnet34': _cfg(url='https://download.pytorch.org/models/resnet34-333f7ec4.pth'), - 'tv_resnet50': _cfg(url='https://download.pytorch.org/models/resnet50-19c8e357.pth'), - 'tv_resnet101': _cfg(url='https://download.pytorch.org/models/resnet101-5d3b4d8f.pth'), - 'tv_resnet152': 
_cfg(url='https://download.pytorch.org/models/resnet152-b121ed2d.pth'), - 'wide_resnet50_2': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/wide_resnet50_racm-8234f177.pth', - interpolation='bicubic'), - 'wide_resnet101_2': _cfg(url='https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth'), - - # ResNeXt - 'resnext50_32x4d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnext50_32x4d_ra-d733960d.pth', - interpolation='bicubic'), - 'resnext50d_32x4d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnext50d_32x4d-103e99f8.pth', - interpolation='bicubic', - first_conv='conv1.0'), - 'resnext101_32x4d': _cfg(url=''), - 'resnext101_32x8d': _cfg(url='https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth'), - 'resnext101_64x4d': _cfg(url=''), - 'tv_resnext50_32x4d': _cfg(url='https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth'), - - # ResNeXt models - Weakly Supervised Pretraining on Instagram Hashtags - # from https://github.com/facebookresearch/WSL-Images - # Please note the CC-BY-NC 4.0 license on these weights, non-commercial use only. - 'ig_resnext101_32x8d': _cfg(url='https://download.pytorch.org/models/ig_resnext101_32x8-c38310e5.pth'), - 'ig_resnext101_32x16d': _cfg(url='https://download.pytorch.org/models/ig_resnext101_32x16-c6f796b0.pth'), - 'ig_resnext101_32x32d': _cfg(url='https://download.pytorch.org/models/ig_resnext101_32x32-e4b90b00.pth'), - 'ig_resnext101_32x48d': _cfg(url='https://download.pytorch.org/models/ig_resnext101_32x48-3e41cc8a.pth'), - - # Semi-Supervised ResNe*t models from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models - # Please note the CC-BY-NC 4.0 license on these weights, non-commercial use only. - 'ssl_resnet18': _cfg( - url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnet18-d92f0530.pth'), - 'ssl_resnet50': _cfg( - url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnet50-08389792.pth'), - 'ssl_resnext50_32x4d': _cfg( - url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnext50_32x4-ddb3e555.pth'), - 'ssl_resnext101_32x4d': _cfg( - url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnext101_32x4-dc43570a.pth'), - 'ssl_resnext101_32x8d': _cfg( - url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnext101_32x8-2cfe2f8b.pth'), - 'ssl_resnext101_32x16d': _cfg( - url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnext101_32x16-15fffa57.pth'), - - # Semi-Weakly Supervised ResNe*t models from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models - # Please note the CC-BY-NC 4.0 license on these weights, non-commercial use only.
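    # How these entries are consumed (a hedged reading): build_model_with_cfg() uses 'url' to
    # locate the checkpoint to load, while keys like 'input_size', 'crop_pct', 'interpolation'
    # and 'test_input_size' record the eval-time preprocessing the weights were validated with,
    # rather than hard architectural constraints.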
- 'swsl_resnet18': _cfg( - url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnet18-118f1556.pth'), - 'swsl_resnet50': _cfg( - url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnet50-16a12f1b.pth'), - 'swsl_resnext50_32x4d': _cfg( - url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnext50_32x4-72679e44.pth'), - 'swsl_resnext101_32x4d': _cfg( - url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnext101_32x4-3f87e46b.pth'), - 'swsl_resnext101_32x8d': _cfg( - url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnext101_32x8-b4712904.pth'), - 'swsl_resnext101_32x16d': _cfg( - url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnext101_32x16-f3559a9c.pth'), - - # Squeeze-Excitation ResNets, to eventually replace the models in senet.py - 'seresnet18': _cfg( - url='', - interpolation='bicubic'), - 'seresnet34': _cfg( - url='', - interpolation='bicubic'), - 'seresnet50': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnet50_ra_224-8efdb4bb.pth', - interpolation='bicubic'), - 'seresnet50t': _cfg( - url='', - interpolation='bicubic', - first_conv='conv1.0'), - 'seresnet101': _cfg( - url='', - interpolation='bicubic'), - 'seresnet152': _cfg( - url='', - interpolation='bicubic'), - 'seresnet152d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnet152d_ra2-04464dd2.pth', - interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8), - crop_pct=1.0, test_input_size=(3, 320, 320) - ), - 'seresnet200d': _cfg( - url='', - interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), crop_pct=0.94, pool_size=(8, 8)), - 'seresnet269d': _cfg( - url='', - interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), crop_pct=0.94, pool_size=(8, 8)), - - - # Squeeze-Excitation ResNeXts, to eventually replace the models in senet.py - 'seresnext26d_32x4d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnext26d_32x4d-80fa48a3.pth', - interpolation='bicubic', - first_conv='conv1.0'), - 'seresnext26t_32x4d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnext26tn_32x4d-569cb627.pth', - interpolation='bicubic', - first_conv='conv1.0'), - 'seresnext50_32x4d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnext50_32x4d_racm-a304a460.pth', - interpolation='bicubic'), - 'seresnext101_32x4d': _cfg( - url='', - interpolation='bicubic'), - 'seresnext101_32x8d': _cfg( - url='', - interpolation='bicubic'), - 'senet154': _cfg( - url='', - interpolation='bicubic', - first_conv='conv1.0'), - - # Efficient Channel Attention ResNets - 'ecaresnet26t': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ecaresnet26t_ra2-46609757.pth', - interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8), - crop_pct=0.95, test_input_size=(3, 320, 320)), - 'ecaresnetlight': _cfg( - url='https://imvl-automl-sh.oss-cn-shanghai.aliyuncs.com/darts/hyperml/hyperml/job_45402/outputs/ECAResNetLight_4f34b35b.pth', - interpolation='bicubic'), - 'ecaresnet50d': _cfg( - 
url='https://imvl-automl-sh.oss-cn-shanghai.aliyuncs.com/darts/hyperml/hyperml/job_45402/outputs/ECAResNet50D_833caf58.pth', - interpolation='bicubic', - first_conv='conv1.0'), - 'ecaresnet50d_pruned': _cfg( - url='https://imvl-automl-sh.oss-cn-shanghai.aliyuncs.com/darts/hyperml/hyperml/job_45899/outputs/ECAResNet50D_P_9c67f710.pth', - interpolation='bicubic', - first_conv='conv1.0'), - 'ecaresnet50t': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ecaresnet50t_ra2-f7ac63c4.pth', - interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8), - crop_pct=0.95, test_input_size=(3, 320, 320)), - 'ecaresnet101d': _cfg( - url='https://imvl-automl-sh.oss-cn-shanghai.aliyuncs.com/darts/hyperml/hyperml/job_45402/outputs/ECAResNet101D_281c5844.pth', - interpolation='bicubic', first_conv='conv1.0'), - 'ecaresnet101d_pruned': _cfg( - url='https://imvl-automl-sh.oss-cn-shanghai.aliyuncs.com/darts/hyperml/hyperml/job_45610/outputs/ECAResNet101D_P_75a3370e.pth', - interpolation='bicubic', - first_conv='conv1.0'), - 'ecaresnet200d': _cfg( - url='', - interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), crop_pct=0.94, pool_size=(8, 8)), - 'ecaresnet269d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ecaresnet269d_320_ra2-7baa55cb.pth', - interpolation='bicubic', first_conv='conv1.0', input_size=(3, 320, 320), pool_size=(10, 10), - crop_pct=1.0, test_input_size=(3, 352, 352)), - - # Efficient Channel Attention ResNeXts - 'ecaresnext26t_32x4d': _cfg( - url='', - interpolation='bicubic', first_conv='conv1.0'), - 'ecaresnext50t_32x4d': _cfg( - url='', - interpolation='bicubic', first_conv='conv1.0'), - - # ResNets with anti-aliasing blur pool - 'resnetblur18': _cfg( - interpolation='bicubic'), - 'resnetblur50': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnetblur50-84f4748f.pth', - interpolation='bicubic') -} - - -def get_padding(kernel_size, stride, dilation=1): - padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 - return padding - - -class BasicBlock(nn.Module): - expansion = 1 - - def __init__(self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64, - reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, - attn_layer=None, aa_layer=None, drop_block=None, drop_path=None): - super(BasicBlock, self).__init__() - - assert cardinality == 1, 'BasicBlock only supports cardinality of 1' - assert base_width == 64, 'BasicBlock does not support changing base width' - first_planes = planes // reduce_first - outplanes = planes * self.expansion - first_dilation = first_dilation or dilation - use_aa = aa_layer is not None and (stride == 2 or first_dilation != dilation) - - self.conv1 = nn.Conv2d( - inplanes, first_planes, kernel_size=3, stride=1 if use_aa else stride, padding=first_dilation, - dilation=first_dilation, bias=False) - self.bn1 = norm_layer(first_planes) - self.act1 = act_layer(inplace=True) - self.aa = aa_layer(channels=first_planes, stride=stride) if use_aa else None - - self.conv2 = nn.Conv2d( - first_planes, outplanes, kernel_size=3, padding=dilation, dilation=dilation, bias=False) - self.bn2 = norm_layer(outplanes) - - self.se = create_attn(attn_layer, outplanes) - - self.act2 = act_layer(inplace=True) - self.downsample = downsample - self.stride = stride - self.dilation = dilation - self.drop_block = drop_block - self.drop_path = 
drop_path - - def zero_init_last_bn(self): - nn.init.zeros_(self.bn2.weight) - - def forward(self, x): - residual = x - - x = self.conv1(x) - x = self.bn1(x) - if self.drop_block is not None: - x = self.drop_block(x) - x = self.act1(x) - if self.aa is not None: - x = self.aa(x) - - x = self.conv2(x) - x = self.bn2(x) - if self.drop_block is not None: - x = self.drop_block(x) - - if self.se is not None: - x = self.se(x) - - if self.drop_path is not None: - x = self.drop_path(x) - - if self.downsample is not None: - residual = self.downsample(residual) - x += residual - x = self.act2(x) - - return x - - -class Bottleneck(nn.Module): - expansion = 4 - - def __init__(self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64, - reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, - attn_layer=None, aa_layer=None, drop_block=None, drop_path=None): - super(Bottleneck, self).__init__() - - width = int(math.floor(planes * (base_width / 64)) * cardinality) - first_planes = width // reduce_first - outplanes = planes * self.expansion - first_dilation = first_dilation or dilation - use_aa = aa_layer is not None and (stride == 2 or first_dilation != dilation) - - self.conv1 = nn.Conv2d(inplanes, first_planes, kernel_size=1, bias=False) - self.bn1 = norm_layer(first_planes) - self.act1 = act_layer(inplace=True) - - self.conv2 = nn.Conv2d( - first_planes, width, kernel_size=3, stride=1 if use_aa else stride, - padding=first_dilation, dilation=first_dilation, groups=cardinality, bias=False) - self.bn2 = norm_layer(width) - self.act2 = act_layer(inplace=True) - self.aa = aa_layer(channels=width, stride=stride) if use_aa else None - - self.conv3 = nn.Conv2d(width, outplanes, kernel_size=1, bias=False) - self.bn3 = norm_layer(outplanes) - - self.se = create_attn(attn_layer, outplanes) - - self.act3 = act_layer(inplace=True) - self.downsample = downsample - self.stride = stride - self.dilation = dilation - self.drop_block = drop_block - self.drop_path = drop_path - - def zero_init_last_bn(self): - nn.init.zeros_(self.bn3.weight) - - def forward(self, x): - residual = x - - x = self.conv1(x) - x = self.bn1(x) - if self.drop_block is not None: - x = self.drop_block(x) - x = self.act1(x) - - x = self.conv2(x) - x = self.bn2(x) - if self.drop_block is not None: - x = self.drop_block(x) - x = self.act2(x) - if self.aa is not None: - x = self.aa(x) - - x = self.conv3(x) - x = self.bn3(x) - if self.drop_block is not None: - x = self.drop_block(x) - - if self.se is not None: - x = self.se(x) - - if self.drop_path is not None: - x = self.drop_path(x) - - if self.downsample is not None: - residual = self.downsample(residual) - x += residual - x = self.act3(x) - - return x - - -def downsample_conv( - in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None): - norm_layer = norm_layer or nn.BatchNorm2d - kernel_size = 1 if stride == 1 and dilation == 1 else kernel_size - first_dilation = (first_dilation or dilation) if kernel_size > 1 else 1 - p = get_padding(kernel_size, stride, first_dilation) - - return nn.Sequential(*[ - nn.Conv2d( - in_channels, out_channels, kernel_size, stride=stride, padding=p, dilation=first_dilation, bias=False), - norm_layer(out_channels) - ]) - - -def downsample_avg( - in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None): - norm_layer = norm_layer or nn.BatchNorm2d - avg_stride = stride if dilation == 1 else 1 - if stride == 1 and dilation == 
1: - pool = nn.Identity() - else: - avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d - pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False) - - return nn.Sequential(*[ - pool, - nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0, bias=False), - norm_layer(out_channels) - ]) - - -def drop_blocks(drop_block_rate=0.): - return [ - None, None, - DropBlock2d(drop_block_rate, 5, 0.25) if drop_block_rate else None, - DropBlock2d(drop_block_rate, 3, 1.00) if drop_block_rate else None] - - -def make_blocks( - block_fn, channels, block_repeats, inplanes, reduce_first=1, output_stride=32, - down_kernel_size=1, avg_down=False, drop_block_rate=0., drop_path_rate=0., **kwargs): - stages = [] - feature_info = [] - net_num_blocks = sum(block_repeats) - net_block_idx = 0 - net_stride = 4 - dilation = prev_dilation = 1 - for stage_idx, (planes, num_blocks, db) in enumerate(zip(channels, block_repeats, drop_blocks(drop_block_rate))): - stage_name = f'layer{stage_idx + 1}' # never liked this name, but weight compat requires it - stride = 1 if stage_idx == 0 else 2 - if net_stride >= output_stride: - dilation *= stride - stride = 1 - else: - net_stride *= stride - - downsample = None - if stride != 1 or inplanes != planes * block_fn.expansion: - down_kwargs = dict( - in_channels=inplanes, out_channels=planes * block_fn.expansion, kernel_size=down_kernel_size, - stride=stride, dilation=dilation, first_dilation=prev_dilation, norm_layer=kwargs.get('norm_layer')) - downsample = downsample_avg(**down_kwargs) if avg_down else downsample_conv(**down_kwargs) - - block_kwargs = dict(reduce_first=reduce_first, dilation=dilation, drop_block=db, **kwargs) - blocks = [] - for block_idx in range(num_blocks): - downsample = downsample if block_idx == 0 else None - stride = stride if block_idx == 0 else 1 - block_dpr = drop_path_rate * net_block_idx / (net_num_blocks - 1) # stochastic depth linear decay rule - blocks.append(block_fn( - inplanes, planes, stride, downsample, first_dilation=prev_dilation, - drop_path=DropPath(block_dpr) if block_dpr > 0. else None, **block_kwargs)) - prev_dilation = dilation - inplanes = planes * block_fn.expansion - net_block_idx += 1 - - stages.append((stage_name, nn.Sequential(*blocks))) - feature_info.append(dict(num_chs=inplanes, reduction=net_stride, module=stage_name)) - - return stages, feature_info - - -class ResNet(nn.Module): - """ResNet / ResNeXt / SE-ResNeXt / SE-Net - - This class implements all variants of ResNet, ResNeXt, SE-ResNeXt, and SENet that - * have > 1 stride in the 3x3 conv layer of bottleneck - * have conv-bn-act ordering - - This ResNet impl supports a number of stem and downsample options based on the v1c, v1d, v1e, and v1s - variants included in the MXNet Gluon ResNetV1b model. The C and D variants are also discussed in the - 'Bag of Tricks' paper: https://arxiv.org/pdf/1812.01187. The B variant is equivalent to torchvision default. 
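- - For example, matching the factory functions further below: resnet50 is ResNet(Bottleneck, [3, 4, 6, 3]), and its 'd' - variant (resnet50d) additionally passes stem_width=32, stem_type='deep', avg_down=True.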
- - ResNet variants (the same modifications can be used in SE/ResNeXt models as well): - * normal, b - 7x7 stem, stem_width = 64, same as torchvision ResNet, NVIDIA ResNet 'v1.5', Gluon v1b - * c - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64) - * d - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64), average pool in downsample - * e - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128), average pool in downsample - * s - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128) - * t - 3 layer deep 3x3 stem, stem width = 32 (24, 48, 64), average pool in downsample - * tn - 3 layer deep 3x3 stem, stem width = 32 (24, 32, 64), average pool in downsample - - ResNeXt - * normal - 7x7 stem, stem_width = 64, standard cardinality and base widths - * same c, d, e, s variants as ResNet can be enabled - - SE-ResNeXt - * normal - 7x7 stem, stem_width = 64 - * same c, d, e, s variants as ResNet can be enabled - - SENet-154 - 3 layer deep 3x3 stem (same as v1c-v1s), stem_width = 64, cardinality=64, - reduction by 2 on width of first bottleneck convolution, 3x3 downsample convs after first block - - Parameters - ---------- - block : Block - Class for the residual block. Options are BasicBlock, Bottleneck. - layers : list of int - Number of residual blocks in each stage - num_classes : int, default 1000 - Number of classification classes. - in_chans : int, default 3 - Number of input (color) channels. - cardinality : int, default 1 - Number of convolution groups for 3x3 conv in Bottleneck. - base_width : int, default 64 - Factor determining bottleneck channels. `planes * base_width / 64 * cardinality` - stem_width : int, default 64 - Number of channels in stem convolutions - stem_type : str, default '' - The type of stem: - * '', default - a single 7x7 conv with a width of stem_width - * 'deep' - three 3x3 convolution layers of widths stem_width, stem_width, stem_width * 2 - * 'deep_tiered' - three 3x3 conv layers of widths stem_width//4 * 3, stem_width, stem_width * 2 - block_reduce_first: int, default 1 - Reduction factor for first convolution output width of residual blocks, - 1 for all archs except SENets, where it is 2 - down_kernel_size: int, default 1 - Kernel size of residual block downsampling path, 1x1 for most archs, 3x3 for SENets - avg_down : bool, default False - Whether to use average pooling for projection skip connection between stages/downsample. - output_stride : int, default 32 - Set the output stride of the network, 32, 16, or 8. Typically used in segmentation. - act_layer : nn.Module, activation layer - norm_layer : nn.Module, normalization layer - aa_layer : nn.Module, anti-aliasing layer - drop_rate : float, default 0. - Dropout probability before classifier, for training - global_pool : str, default 'avg' - Global pooling type. 
One of 'avg', 'max', 'avgmax', 'catavgmax' - """ - - def __init__(self, block, layers, num_classes=1000, in_chans=3, - cardinality=1, base_width=64, stem_width=64, stem_type='', - output_stride=32, block_reduce_first=1, down_kernel_size=1, avg_down=False, - act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None, drop_rate=0.0, drop_path_rate=0., - drop_block_rate=0., global_pool='avg', zero_init_last_bn=True, block_args=None): - block_args = block_args or dict() - assert output_stride in (8, 16, 32) - self.num_classes = num_classes - self.drop_rate = drop_rate - super(ResNet, self).__init__() - - # Stem - deep_stem = 'deep' in stem_type - inplanes = stem_width * 2 if deep_stem else 64 - if deep_stem: - stem_chs = (stem_width, stem_width) - if 'tiered' in stem_type: - stem_chs = (3 * (stem_width // 4), stem_width) - self.conv1 = nn.Sequential(*[ - nn.Conv2d(in_chans, stem_chs[0], 3, stride=2, padding=1, bias=False), - norm_layer(stem_chs[0]), - act_layer(inplace=True), - nn.Conv2d(stem_chs[0], stem_chs[1], 3, stride=1, padding=1, bias=False), - norm_layer(stem_chs[1]), - act_layer(inplace=True), - nn.Conv2d(stem_chs[1], inplanes, 3, stride=1, padding=1, bias=False)]) - else: - self.conv1 = nn.Conv2d(in_chans, inplanes, kernel_size=7, stride=2, padding=3, bias=False) - self.bn1 = norm_layer(inplanes) - self.act1 = act_layer(inplace=True) - self.feature_info = [dict(num_chs=inplanes, reduction=2, module='act1')] - - # Stem Pooling - if aa_layer is not None: - self.maxpool = nn.Sequential(*[ - nn.MaxPool2d(kernel_size=3, stride=1, padding=1), - aa_layer(channels=inplanes, stride=2)]) - else: - self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - - # Feature Blocks - channels = [64, 128, 256, 512] - stage_modules, stage_feature_info = make_blocks( - block, channels, layers, inplanes, cardinality=cardinality, base_width=base_width, - output_stride=output_stride, reduce_first=block_reduce_first, avg_down=avg_down, - down_kernel_size=down_kernel_size, act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer, - drop_block_rate=drop_block_rate, drop_path_rate=drop_path_rate, **block_args) - for stage in stage_modules: - self.add_module(*stage) # layer1, layer2, etc - self.feature_info.extend(stage_feature_info) - - # Head (Pooling and Classifier) - self.num_features = 512 * block.expansion - self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) - - for n, m in self.named_modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1.) - nn.init.constant_(m.bias, 0.) 
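- # NOTE: the zero_init_last_bn pass below zeroes the last BN gamma of every residual block - # (via BasicBlock.zero_init_last_bn / Bottleneck.zero_init_last_bn), so each block starts out - # near-identity; this is the zero-gamma trick from the 'Bag of Tricks' paper cited above.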
- if zero_init_last_bn: - for m in self.modules(): - if hasattr(m, 'zero_init_last_bn'): - m.zero_init_last_bn() - - def get_classifier(self): - return self.fc - - def reset_classifier(self, num_classes, global_pool='avg'): - self.num_classes = num_classes - self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) - - def forward_features(self, x): - x = self.conv1(x) - x = self.bn1(x) - x = self.act1(x) - x = self.maxpool(x) - - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.layer4(x) - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.global_pool(x) - if self.drop_rate: - x = F.dropout(x, p=float(self.drop_rate), training=self.training) - x = self.fc(x) - return x - - -def _create_resnet(variant, pretrained=False, **kwargs): - return build_model_with_cfg( - ResNet, variant, pretrained, - default_cfg=default_cfgs[variant], - **kwargs) - - -@register_model -def resnet18(pretrained=False, **kwargs): - """Constructs a ResNet-18 model. - """ - model_args = dict(block=BasicBlock, layers=[2, 2, 2, 2], **kwargs) - return _create_resnet('resnet18', pretrained, **model_args) - - -@register_model -def resnet18d(pretrained=False, **kwargs): - """Constructs a ResNet-18-D model. - """ - model_args = dict( - block=BasicBlock, layers=[2, 2, 2, 2], stem_width=32, stem_type='deep', avg_down=True, **kwargs) - return _create_resnet('resnet18d', pretrained, **model_args) - - -@register_model -def resnet34(pretrained=False, **kwargs): - """Constructs a ResNet-34 model. - """ - model_args = dict(block=BasicBlock, layers=[3, 4, 6, 3], **kwargs) - return _create_resnet('resnet34', pretrained, **model_args) - - -@register_model -def resnet34d(pretrained=False, **kwargs): - """Constructs a ResNet-34-D model. - """ - model_args = dict( - block=BasicBlock, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs) - return _create_resnet('resnet34d', pretrained, **model_args) - - -@register_model -def resnet26(pretrained=False, **kwargs): - """Constructs a ResNet-26 model. - """ - model_args = dict(block=Bottleneck, layers=[2, 2, 2, 2], **kwargs) - return _create_resnet('resnet26', pretrained, **model_args) - - -@register_model -def resnet26d(pretrained=False, **kwargs): - """Constructs a ResNet-26-D model. - """ - model_args = dict(block=Bottleneck, layers=[2, 2, 2, 2], stem_width=32, stem_type='deep', avg_down=True, **kwargs) - return _create_resnet('resnet26d', pretrained, **model_args) - - -@register_model -def resnet50(pretrained=False, **kwargs): - """Constructs a ResNet-50 model. - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], **kwargs) - return _create_resnet('resnet50', pretrained, **model_args) - - -@register_model -def resnet50d(pretrained=False, **kwargs): - """Constructs a ResNet-50-D model. - """ - model_args = dict( - block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs) - return _create_resnet('resnet50d', pretrained, **model_args) - - -@register_model -def resnet101(pretrained=False, **kwargs): - """Constructs a ResNet-101 model. - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], **kwargs) - return _create_resnet('resnet101', pretrained, **model_args) - - -@register_model -def resnet101d(pretrained=False, **kwargs): - """Constructs a ResNet-101-D model. 
- """ - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs) - return _create_resnet('resnet101d', pretrained, **model_args) - - -@register_model -def resnet152(pretrained=False, **kwargs): - """Constructs a ResNet-152 model. - """ - model_args = dict(block=Bottleneck, layers=[3, 8, 36, 3], **kwargs) - return _create_resnet('resnet152', pretrained, **model_args) - - -@register_model -def resnet152d(pretrained=False, **kwargs): - """Constructs a ResNet-152-D model. - """ - model_args = dict( - block=Bottleneck, layers=[3, 8, 36, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs) - return _create_resnet('resnet152d', pretrained, **model_args) - - -@register_model -def resnet200(pretrained=False, **kwargs): - """Constructs a ResNet-200 model. - """ - model_args = dict(block=Bottleneck, layers=[3, 24, 36, 3], **kwargs) - return _create_resnet('resnet200', pretrained, **model_args) - - -@register_model -def resnet200d(pretrained=False, **kwargs): - """Constructs a ResNet-200-D model. - """ - model_args = dict( - block=Bottleneck, layers=[3, 24, 36, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs) - return _create_resnet('resnet200d', pretrained, **model_args) - - -@register_model -def tv_resnet34(pretrained=False, **kwargs): - """Constructs a ResNet-34 model with original Torchvision weights. - """ - model_args = dict(block=BasicBlock, layers=[3, 4, 6, 3], **kwargs) - return _create_resnet('tv_resnet34', pretrained, **model_args) - - -@register_model -def tv_resnet50(pretrained=False, **kwargs): - """Constructs a ResNet-50 model with original Torchvision weights. - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], **kwargs) - return _create_resnet('tv_resnet50', pretrained, **model_args) - - -@register_model -def tv_resnet101(pretrained=False, **kwargs): - """Constructs a ResNet-101 model w/ Torchvision pretrained weights. - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], **kwargs) - return _create_resnet('tv_resnet101', pretrained, **model_args) - - -@register_model -def tv_resnet152(pretrained=False, **kwargs): - """Constructs a ResNet-152 model w/ Torchvision pretrained weights. - """ - model_args = dict(block=Bottleneck, layers=[3, 8, 36, 3], **kwargs) - return _create_resnet('tv_resnet152', pretrained, **model_args) - - -@register_model -def wide_resnet50_2(pretrained=False, **kwargs): - """Constructs a Wide ResNet-50-2 model. - The model is the same as ResNet except for the bottleneck number of channels - which is twice larger in every block. The number of channels in outer 1x1 - convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 - channels, and in Wide ResNet-50-2 has 2048-1024-2048. - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], base_width=128, **kwargs) - return _create_resnet('wide_resnet50_2', pretrained, **model_args) - - -@register_model -def wide_resnet101_2(pretrained=False, **kwargs): - """Constructs a Wide ResNet-101-2 model. - The model is the same as ResNet except for the bottleneck number of channels - which is twice larger in every block. The number of channels in outer 1x1 - convolutions is the same. - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], base_width=128, **kwargs) - return _create_resnet('wide_resnet101_2', pretrained, **model_args) - - -@register_model -def resnext50_32x4d(pretrained=False, **kwargs): - """Constructs a ResNeXt50-32x4d model. 
- """ - model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, **kwargs) - return _create_resnet('resnext50_32x4d', pretrained, **model_args) - - -@register_model -def resnext50d_32x4d(pretrained=False, **kwargs): - """Constructs a ResNeXt50d-32x4d model. ResNext50 w/ deep stem & avg pool downsample - """ - model_args = dict( - block=Bottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, - stem_width=32, stem_type='deep', avg_down=True, **kwargs) - return _create_resnet('resnext50d_32x4d', pretrained, **model_args) - - -@register_model -def resnext101_32x4d(pretrained=False, **kwargs): - """Constructs a ResNeXt-101 32x4d model. - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=4, **kwargs) - return _create_resnet('resnext101_32x4d', pretrained, **model_args) - - -@register_model -def resnext101_32x8d(pretrained=False, **kwargs): - """Constructs a ResNeXt-101 32x8d model. - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=8, **kwargs) - return _create_resnet('resnext101_32x8d', pretrained, **model_args) - - -@register_model -def resnext101_64x4d(pretrained=False, **kwargs): - """Constructs a ResNeXt101-64x4d model. - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=64, base_width=4, **kwargs) - return _create_resnet('resnext101_64x4d', pretrained, **model_args) - - -@register_model -def tv_resnext50_32x4d(pretrained=False, **kwargs): - """Constructs a ResNeXt50-32x4d model with original Torchvision weights. - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, **kwargs) - return _create_resnet('tv_resnext50_32x4d', pretrained, **model_args) - - -@register_model -def ig_resnext101_32x8d(pretrained=True, **kwargs): - """Constructs a ResNeXt-101 32x8 model pre-trained on weakly-supervised data - and finetuned on ImageNet from Figure 5 in - `"Exploring the Limits of Weakly Supervised Pretraining" `_ - Weights from https://pytorch.org/hub/facebookresearch_WSL-Images_resnext/ - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=8, **kwargs) - return _create_resnet('ig_resnext101_32x8d', pretrained, **model_args) - - -@register_model -def ig_resnext101_32x16d(pretrained=True, **kwargs): - """Constructs a ResNeXt-101 32x16 model pre-trained on weakly-supervised data - and finetuned on ImageNet from Figure 5 in - `"Exploring the Limits of Weakly Supervised Pretraining" `_ - Weights from https://pytorch.org/hub/facebookresearch_WSL-Images_resnext/ - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=16, **kwargs) - return _create_resnet('ig_resnext101_32x16d', pretrained, **model_args) - - -@register_model -def ig_resnext101_32x32d(pretrained=True, **kwargs): - """Constructs a ResNeXt-101 32x32 model pre-trained on weakly-supervised data - and finetuned on ImageNet from Figure 5 in - `"Exploring the Limits of Weakly Supervised Pretraining" `_ - Weights from https://pytorch.org/hub/facebookresearch_WSL-Images_resnext/ - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=32, **kwargs) - return _create_resnet('ig_resnext101_32x32d', pretrained, **model_args) - - -@register_model -def ig_resnext101_32x48d(pretrained=True, **kwargs): - """Constructs a ResNeXt-101 32x48 model pre-trained on weakly-supervised data - and finetuned on ImageNet from Figure 5 in - `"Exploring the Limits of Weakly 
Supervised Pretraining" `_ - Weights from https://pytorch.org/hub/facebookresearch_WSL-Images_resnext/ - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=48, **kwargs) - return _create_resnet('ig_resnext101_32x48d', pretrained, **model_args) - - -@register_model -def ssl_resnet18(pretrained=True, **kwargs): - """Constructs a semi-supervised ResNet-18 model pre-trained on YFCC100M dataset and finetuned on ImageNet - `"Billion-scale Semi-Supervised Learning for Image Classification" `_ - Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ - """ - model_args = dict(block=BasicBlock, layers=[2, 2, 2, 2], **kwargs) - return _create_resnet('ssl_resnet18', pretrained, **model_args) - - -@register_model -def ssl_resnet50(pretrained=True, **kwargs): - """Constructs a semi-supervised ResNet-50 model pre-trained on YFCC100M dataset and finetuned on ImageNet - `"Billion-scale Semi-Supervised Learning for Image Classification" `_ - Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], **kwargs) - return _create_resnet('ssl_resnet50', pretrained, **model_args) - - -@register_model -def ssl_resnext50_32x4d(pretrained=True, **kwargs): - """Constructs a semi-supervised ResNeXt-50 32x4 model pre-trained on YFCC100M dataset and finetuned on ImageNet - `"Billion-scale Semi-Supervised Learning for Image Classification" `_ - Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, **kwargs) - return _create_resnet('ssl_resnext50_32x4d', pretrained, **model_args) - - -@register_model -def ssl_resnext101_32x4d(pretrained=True, **kwargs): - """Constructs a semi-supervised ResNeXt-101 32x4 model pre-trained on YFCC100M dataset and finetuned on ImageNet - `"Billion-scale Semi-Supervised Learning for Image Classification" `_ - Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=4, **kwargs) - return _create_resnet('ssl_resnext101_32x4d', pretrained, **model_args) - - -@register_model -def ssl_resnext101_32x8d(pretrained=True, **kwargs): - """Constructs a semi-supervised ResNeXt-101 32x8 model pre-trained on YFCC100M dataset and finetuned on ImageNet - `"Billion-scale Semi-Supervised Learning for Image Classification" `_ - Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=8, **kwargs) - return _create_resnet('ssl_resnext101_32x8d', pretrained, **model_args) - - -@register_model -def ssl_resnext101_32x16d(pretrained=True, **kwargs): - """Constructs a semi-supervised ResNeXt-101 32x16 model pre-trained on YFCC100M dataset and finetuned on ImageNet - `"Billion-scale Semi-Supervised Learning for Image Classification" `_ - Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=16, **kwargs) - return _create_resnet('ssl_resnext101_32x16d', pretrained, **model_args) - - -@register_model -def swsl_resnet18(pretrained=True, **kwargs): - """Constructs a semi-weakly supervised Resnet-18 model pre-trained on 1B weakly supervised - image dataset and finetuned on ImageNet. 
- `"Billion-scale Semi-Supervised Learning for Image Classification" `_ - Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ - """ - model_args = dict(block=BasicBlock, layers=[2, 2, 2, 2], **kwargs) - return _create_resnet('swsl_resnet18', pretrained, **model_args) - - -@register_model -def swsl_resnet50(pretrained=True, **kwargs): - """Constructs a semi-weakly supervised ResNet-50 model pre-trained on 1B weakly supervised - image dataset and finetuned on ImageNet. - `"Billion-scale Semi-Supervised Learning for Image Classification" `_ - Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], **kwargs) - return _create_resnet('swsl_resnet50', pretrained, **model_args) - - -@register_model -def swsl_resnext50_32x4d(pretrained=True, **kwargs): - """Constructs a semi-weakly supervised ResNeXt-50 32x4 model pre-trained on 1B weakly supervised - image dataset and finetuned on ImageNet. - `"Billion-scale Semi-Supervised Learning for Image Classification" `_ - Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, **kwargs) - return _create_resnet('swsl_resnext50_32x4d', pretrained, **model_args) - - -@register_model -def swsl_resnext101_32x4d(pretrained=True, **kwargs): - """Constructs a semi-weakly supervised ResNeXt-101 32x4 model pre-trained on 1B weakly supervised - image dataset and finetuned on ImageNet. - `"Billion-scale Semi-Supervised Learning for Image Classification" `_ - Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=4, **kwargs) - return _create_resnet('swsl_resnext101_32x4d', pretrained, **model_args) - - -@register_model -def swsl_resnext101_32x8d(pretrained=True, **kwargs): - """Constructs a semi-weakly supervised ResNeXt-101 32x8 model pre-trained on 1B weakly supervised - image dataset and finetuned on ImageNet. - `"Billion-scale Semi-Supervised Learning for Image Classification" `_ - Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=8, **kwargs) - return _create_resnet('swsl_resnext101_32x8d', pretrained, **model_args) - - -@register_model -def swsl_resnext101_32x16d(pretrained=True, **kwargs): - """Constructs a semi-weakly supervised ResNeXt-101 32x16 model pre-trained on 1B weakly supervised - image dataset and finetuned on ImageNet. - `"Billion-scale Semi-Supervised Learning for Image Classification" `_ - Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=16, **kwargs) - return _create_resnet('swsl_resnext101_32x16d', pretrained, **model_args) - - -@register_model -def ecaresnet26t(pretrained=False, **kwargs): - """Constructs an ECA-ResNeXt-26-T model. - This is technically a 28 layer ResNet, like a 'D' bag-of-tricks model but with tiered 24, 32, 64 channels - in the deep stem and ECA attn. 
- """ - model_args = dict( - block=Bottleneck, layers=[2, 2, 2, 2], stem_width=32, - stem_type='deep_tiered', avg_down=True, block_args=dict(attn_layer='eca'), **kwargs) - return _create_resnet('ecaresnet26t', pretrained, **model_args) - - -@register_model -def ecaresnet50d(pretrained=False, **kwargs): - """Constructs a ResNet-50-D model with eca. - """ - model_args = dict( - block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', avg_down=True, - block_args=dict(attn_layer='eca'), **kwargs) - return _create_resnet('ecaresnet50d', pretrained, **model_args) - - -@register_model -def ecaresnet50d_pruned(pretrained=False, **kwargs): - """Constructs a ResNet-50-D model pruned with eca. - The pruning has been obtained using https://arxiv.org/pdf/2002.08258.pdf - """ - model_args = dict( - block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', avg_down=True, - block_args=dict(attn_layer='eca'), **kwargs) - return _create_resnet('ecaresnet50d_pruned', pretrained, pruned=True, **model_args) - - -@register_model -def ecaresnet50t(pretrained=False, **kwargs): - """Constructs an ECA-ResNet-50-T model. - Like a 'D' bag-of-tricks model but with tiered 24, 32, 64 channels in the deep stem and ECA attn. - """ - model_args = dict( - block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, - stem_type='deep_tiered', avg_down=True, block_args=dict(attn_layer='eca'), **kwargs) - return _create_resnet('ecaresnet50t', pretrained, **model_args) - - -@register_model -def ecaresnetlight(pretrained=False, **kwargs): - """Constructs a ResNet-50-D light model with eca. - """ - model_args = dict( - block=Bottleneck, layers=[1, 1, 11, 3], stem_width=32, avg_down=True, - block_args=dict(attn_layer='eca'), **kwargs) - return _create_resnet('ecaresnetlight', pretrained, **model_args) - - -@register_model -def ecaresnet101d(pretrained=False, **kwargs): - """Constructs a ResNet-101-D model with eca. - """ - model_args = dict( - block=Bottleneck, layers=[3, 4, 23, 3], stem_width=32, stem_type='deep', avg_down=True, - block_args=dict(attn_layer='eca'), **kwargs) - return _create_resnet('ecaresnet101d', pretrained, **model_args) - - -@register_model -def ecaresnet101d_pruned(pretrained=False, **kwargs): - """Constructs a ResNet-101-D model pruned with eca. - The pruning has been obtained using https://arxiv.org/pdf/2002.08258.pdf - """ - model_args = dict( - block=Bottleneck, layers=[3, 4, 23, 3], stem_width=32, stem_type='deep', avg_down=True, - block_args=dict(attn_layer='eca'), **kwargs) - return _create_resnet('ecaresnet101d_pruned', pretrained, pruned=True, **model_args) - - -@register_model -def ecaresnet200d(pretrained=False, **kwargs): - """Constructs a ResNet-200-D model with ECA. - """ - model_args = dict( - block=Bottleneck, layers=[3, 24, 36, 3], stem_width=32, stem_type='deep', avg_down=True, - block_args=dict(attn_layer='eca'), **kwargs) - return _create_resnet('ecaresnet200d', pretrained, **model_args) - - -@register_model -def ecaresnet269d(pretrained=False, **kwargs): - """Constructs a ResNet-269-D model with ECA. - """ - model_args = dict( - block=Bottleneck, layers=[3, 30, 48, 8], stem_width=32, stem_type='deep', avg_down=True, - block_args=dict(attn_layer='eca'), **kwargs) - return _create_resnet('ecaresnet269d', pretrained, **model_args) - - -@register_model -def ecaresnext26t_32x4d(pretrained=False, **kwargs): - """Constructs an ECA-ResNeXt-26-T model. 
- This is technically a 28 layer ResNet, like a 'D' bag-of-tricks model but with tiered 24, 32, 64 channels - in the deep stem. This model replaces SE module with the ECA module - """ - model_args = dict( - block=Bottleneck, layers=[2, 2, 2, 2], cardinality=32, base_width=4, stem_width=32, - stem_type='deep_tiered', avg_down=True, block_args=dict(attn_layer='eca'), **kwargs) - return _create_resnet('ecaresnext26t_32x4d', pretrained, **model_args) - - -@register_model -def ecaresnext50t_32x4d(pretrained=False, **kwargs): - """Constructs an ECA-ResNeXt-50-T model. - This is technically a 28 layer ResNet, like a 'D' bag-of-tricks model but with tiered 24, 32, 64 channels - in the deep stem. This model replaces SE module with the ECA module - """ - model_args = dict( - block=Bottleneck, layers=[2, 2, 2, 2], cardinality=32, base_width=4, stem_width=32, - stem_type='deep_tiered', avg_down=True, block_args=dict(attn_layer='eca'), **kwargs) - return _create_resnet('ecaresnext50t_32x4d', pretrained, **model_args) - - -@register_model -def resnetblur18(pretrained=False, **kwargs): - """Constructs a ResNet-18 model with blur anti-aliasing - """ - model_args = dict(block=BasicBlock, layers=[2, 2, 2, 2], aa_layer=BlurPool2d, **kwargs) - return _create_resnet('resnetblur18', pretrained, **model_args) - - -@register_model -def resnetblur50(pretrained=False, **kwargs): - """Constructs a ResNet-50 model with blur anti-aliasing - """ - model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], aa_layer=BlurPool2d, **kwargs) - return _create_resnet('resnetblur50', pretrained, **model_args) - - -@register_model -def seresnet18(pretrained=False, **kwargs): - model_args = dict(block=BasicBlock, layers=[2, 2, 2, 2], block_args=dict(attn_layer='se'), **kwargs) - return _create_resnet('seresnet18', pretrained, **model_args) - - -@register_model -def seresnet34(pretrained=False, **kwargs): - model_args = dict(block=BasicBlock, layers=[3, 4, 6, 3], block_args=dict(attn_layer='se'), **kwargs) - return _create_resnet('seresnet34', pretrained, **model_args) - - -@register_model -def seresnet50(pretrained=False, **kwargs): - model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], block_args=dict(attn_layer='se'), **kwargs) - return _create_resnet('seresnet50', pretrained, **model_args) - - -@register_model -def seresnet50t(pretrained=False, **kwargs): - model_args = dict( - block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep_tiered', avg_down=True, - block_args=dict(attn_layer='se'), **kwargs) - return _create_resnet('seresnet50t', pretrained, **model_args) - - -@register_model -def seresnet101(pretrained=False, **kwargs): - model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], block_args=dict(attn_layer='se'), **kwargs) - return _create_resnet('seresnet101', pretrained, **model_args) - - -@register_model -def seresnet152(pretrained=False, **kwargs): - model_args = dict(block=Bottleneck, layers=[3, 8, 36, 3], block_args=dict(attn_layer='se'), **kwargs) - return _create_resnet('seresnet152', pretrained, **model_args) - - -@register_model -def seresnet152d(pretrained=False, **kwargs): - model_args = dict( - block=Bottleneck, layers=[3, 8, 36, 3], stem_width=32, stem_type='deep', avg_down=True, - block_args=dict(attn_layer='se'), **kwargs) - return _create_resnet('seresnet152d', pretrained, **model_args) - - -@register_model -def seresnet200d(pretrained=False, **kwargs): - """Constructs a ResNet-200-D model with SE attn. 
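- Same config as resnet200d above, plus block_args=dict(attn_layer='se'), which each Bottleneck routes - through create_attn() to attach an SE attention module.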
- """ - model_args = dict( - block=Bottleneck, layers=[3, 24, 36, 3], stem_width=32, stem_type='deep', avg_down=True, - block_args=dict(attn_layer='se'), **kwargs) - return _create_resnet('seresnet200d', pretrained, **model_args) - - -@register_model -def seresnet269d(pretrained=False, **kwargs): - """Constructs a ResNet-269-D model with SE attn. - """ - model_args = dict( - block=Bottleneck, layers=[3, 30, 48, 8], stem_width=32, stem_type='deep', avg_down=True, - block_args=dict(attn_layer='se'), **kwargs) - return _create_resnet('seresnet269d', pretrained, **model_args) - - -@register_model -def seresnext26d_32x4d(pretrained=False, **kwargs): - """Constructs a SE-ResNeXt-26-D model.` - This is technically a 28 layer ResNet, using the 'D' modifier from Gluon / bag-of-tricks for - combination of deep stem and avg_pool in downsample. - """ - model_args = dict( - block=Bottleneck, layers=[2, 2, 2, 2], cardinality=32, base_width=4, stem_width=32, - stem_type='deep', avg_down=True, block_args=dict(attn_layer='se'), **kwargs) - return _create_resnet('seresnext26d_32x4d', pretrained, **model_args) - - -@register_model -def seresnext26t_32x4d(pretrained=False, **kwargs): - """Constructs a SE-ResNet-26-T model. - This is technically a 28 layer ResNet, like a 'D' bag-of-tricks model but with tiered 24, 32, 64 channels - in the deep stem. - """ - model_args = dict( - block=Bottleneck, layers=[2, 2, 2, 2], cardinality=32, base_width=4, stem_width=32, - stem_type='deep_tiered', avg_down=True, block_args=dict(attn_layer='se'), **kwargs) - return _create_resnet('seresnext26t_32x4d', pretrained, **model_args) - - -@register_model -def seresnext26tn_32x4d(pretrained=False, **kwargs): - """Constructs a SE-ResNeXt-26-T model. - NOTE I deprecated previous 't' model defs and replaced 't' with 'tn', this was the only tn model of note - so keeping this def for backwards compat with any uses out there. Old 't' model is lost. - """ - return seresnext26t_32x4d(pretrained=pretrained, **kwargs) - - -@register_model -def seresnext50_32x4d(pretrained=False, **kwargs): - model_args = dict( - block=Bottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, - block_args=dict(attn_layer='se'), **kwargs) - return _create_resnet('seresnext50_32x4d', pretrained, **model_args) - - -@register_model -def seresnext101_32x4d(pretrained=False, **kwargs): - model_args = dict( - block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=4, - block_args=dict(attn_layer='se'), **kwargs) - return _create_resnet('seresnext101_32x4d', pretrained, **model_args) - - -@register_model -def seresnext101_32x8d(pretrained=False, **kwargs): - model_args = dict( - block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=8, - block_args=dict(attn_layer='se'), **kwargs) - return _create_resnet('seresnext101_32x8d', pretrained, **model_args) - - -@register_model -def senet154(pretrained=False, **kwargs): - model_args = dict( - block=Bottleneck, layers=[3, 8, 36, 3], cardinality=64, base_width=4, stem_type='deep', - down_kernel_size=3, block_reduce_first=2, block_args=dict(attn_layer='se'), **kwargs) - return _create_resnet('senet154', pretrained, **model_args) diff --git a/AVLFormer/src/timm/models/resnetv2.py b/AVLFormer/src/timm/models/resnetv2.py deleted file mode 100644 index 4a28adc..0000000 --- a/AVLFormer/src/timm/models/resnetv2.py +++ /dev/null @@ -1,554 +0,0 @@ -"""Pre-Activation ResNet v2 with GroupNorm and Weight Standardization. 
- -A PyTorch implementation of ResNetV2 adapted from the Google Big-Transfer (BiT) source code -at https://github.com/google-research/big_transfer to match timm interfaces. The BiT weights have -been included here as pretrained models from their original .NPZ checkpoints. - -Additionally, supports non pre-activation bottleneck for use as a backbone for Vision Transformers (ViT) and -extra padding support to allow porting of official Hybrid ResNet pretrained weights from -https://github.com/google-research/vision_transformer - -Thanks to the Google team for the above two repositories and associated papers: -* Big Transfer (BiT): General Visual Representation Learning - https://arxiv.org/abs/1912.11370 -* An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale - https://arxiv.org/abs/2010.11929 - -Original copyright of Google code below, modifications by Ross Wightman, Copyright 2020. -""" -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections import OrderedDict # pylint: disable=g-importing-member - -import torch -import torch.nn as nn -from functools import partial - -from src.timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD -from .helpers import build_model_with_cfg -from .registry import register_model -from .layers import GroupNormAct, ClassifierHead, DropPath, AvgPool2dSame, create_pool2d, StdConv2d - - -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 480, 480), 'pool_size': (7, 7), - 'crop_pct': 1.0, 'interpolation': 'bilinear', - 'mean': IMAGENET_INCEPTION_MEAN, 'std': IMAGENET_INCEPTION_STD, - 'first_conv': 'stem.conv', 'classifier': 'head.fc', - **kwargs - } - - -default_cfgs = { - # pretrained on imagenet21k, finetuned on imagenet1k - 'resnetv2_50x1_bitm': _cfg( - url='https://storage.googleapis.com/bit_models/BiT-M-R50x1-ILSVRC2012.npz'), - 'resnetv2_50x3_bitm': _cfg( - url='https://storage.googleapis.com/bit_models/BiT-M-R50x3-ILSVRC2012.npz'), - 'resnetv2_101x1_bitm': _cfg( - url='https://storage.googleapis.com/bit_models/BiT-M-R101x1-ILSVRC2012.npz'), - 'resnetv2_101x3_bitm': _cfg( - url='https://storage.googleapis.com/bit_models/BiT-M-R101x3-ILSVRC2012.npz'), - 'resnetv2_152x2_bitm': _cfg( - url='https://storage.googleapis.com/bit_models/BiT-M-R152x2-ILSVRC2012.npz'), - 'resnetv2_152x4_bitm': _cfg( - url='https://storage.googleapis.com/bit_models/BiT-M-R152x4-ILSVRC2012.npz'), - - # trained on imagenet-21k - 'resnetv2_50x1_bitm_in21k': _cfg( - url='https://storage.googleapis.com/bit_models/BiT-M-R50x1.npz', - num_classes=21843), - 'resnetv2_50x3_bitm_in21k': _cfg( - url='https://storage.googleapis.com/bit_models/BiT-M-R50x3.npz', - num_classes=21843), - 'resnetv2_101x1_bitm_in21k': _cfg( - url='https://storage.googleapis.com/bit_models/BiT-M-R101x1.npz', - num_classes=21843), - 'resnetv2_101x3_bitm_in21k': _cfg( - url='https://storage.googleapis.com/bit_models/BiT-M-R101x3.npz', - num_classes=21843), - 'resnetv2_152x2_bitm_in21k': _cfg( - 
url='https://storage.googleapis.com/bit_models/BiT-M-R152x2.npz', - num_classes=21843), - 'resnetv2_152x4_bitm_in21k': _cfg( - url='https://storage.googleapis.com/bit_models/BiT-M-R152x4.npz', - num_classes=21843), - - - # trained on imagenet-1k, NOTE not overly interesting set of weights, leaving disabled for now - # 'resnetv2_50x1_bits': _cfg( - # url='https://storage.googleapis.com/bit_models/BiT-S-R50x1.npz'), - # 'resnetv2_50x3_bits': _cfg( - # url='https://storage.googleapis.com/bit_models/BiT-S-R50x3.npz'), - # 'resnetv2_101x1_bits': _cfg( - # url='https://storage.googleapis.com/bit_models/BiT-S-R101x3.npz'), - # 'resnetv2_101x3_bits': _cfg( - # url='https://storage.googleapis.com/bit_models/BiT-S-R101x3.npz'), - # 'resnetv2_152x2_bits': _cfg( - # url='https://storage.googleapis.com/bit_models/BiT-S-R152x2.npz'), - # 'resnetv2_152x4_bits': _cfg( - # url='https://storage.googleapis.com/bit_models/BiT-S-R152x4.npz'), -} - - -def make_div(v, divisor=8): - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - if new_v < 0.9 * v: - new_v += divisor - return new_v - - -def tf2th(conv_weights): - """Possibly convert HWIO to OIHW.""" - if conv_weights.ndim == 4: - conv_weights = conv_weights.transpose([3, 2, 0, 1]) - return torch.from_numpy(conv_weights) - - -class PreActBottleneck(nn.Module): - """Pre-activation (v2) bottleneck block. - - Follows the implementation of "Identity Mappings in Deep Residual Networks": - https://github.com/KaimingHe/resnet-1k-layers/blob/master/resnet-pre-act.lua - - Except it puts the stride on 3x3 conv when available. - """ - - def __init__( - self, in_chs, out_chs=None, bottle_ratio=0.25, stride=1, dilation=1, first_dilation=None, groups=1, - act_layer=None, conv_layer=None, norm_layer=None, proj_layer=None, drop_path_rate=0.): - super().__init__() - first_dilation = first_dilation or dilation - conv_layer = conv_layer or StdConv2d - norm_layer = norm_layer or partial(GroupNormAct, num_groups=32) - out_chs = out_chs or in_chs - mid_chs = make_div(out_chs * bottle_ratio) - - if proj_layer is not None: - self.downsample = proj_layer( - in_chs, out_chs, stride=stride, dilation=dilation, first_dilation=first_dilation, preact=True, - conv_layer=conv_layer, norm_layer=norm_layer) - else: - self.downsample = None - - self.norm1 = norm_layer(in_chs) - self.conv1 = conv_layer(in_chs, mid_chs, 1) - self.norm2 = norm_layer(mid_chs) - self.conv2 = conv_layer(mid_chs, mid_chs, 3, stride=stride, dilation=first_dilation, groups=groups) - self.norm3 = norm_layer(mid_chs) - self.conv3 = conv_layer(mid_chs, out_chs, 1) - self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() - - def forward(self, x): - x_preact = self.norm1(x) - - # shortcut branch - shortcut = x - if self.downsample is not None: - shortcut = self.downsample(x_preact) - - # residual branch - x = self.conv1(x_preact) - x = self.conv2(self.norm2(x)) - x = self.conv3(self.norm3(x)) - x = self.drop_path(x) - return x + shortcut - - -class Bottleneck(nn.Module): - """Non Pre-activation bottleneck block, equiv to V1.5/V1b Bottleneck. Used for ViT. 
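- Compared to PreActBottleneck above, each conv here is followed by its norm layer rather than preceded by one, - the shortcut is taken from the raw input, and act3 is applied after the residual addition.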
- """ - def __init__( - self, in_chs, out_chs=None, bottle_ratio=0.25, stride=1, dilation=1, first_dilation=None, groups=1, - act_layer=None, conv_layer=None, norm_layer=None, proj_layer=None, drop_path_rate=0.): - super().__init__() - first_dilation = first_dilation or dilation - act_layer = act_layer or nn.ReLU - conv_layer = conv_layer or StdConv2d - norm_layer = norm_layer or partial(GroupNormAct, num_groups=32) - out_chs = out_chs or in_chs - mid_chs = make_div(out_chs * bottle_ratio) - - if proj_layer is not None: - self.downsample = proj_layer( - in_chs, out_chs, stride=stride, dilation=dilation, preact=False, - conv_layer=conv_layer, norm_layer=norm_layer) - else: - self.downsample = None - - self.conv1 = conv_layer(in_chs, mid_chs, 1) - self.norm1 = norm_layer(mid_chs) - self.conv2 = conv_layer(mid_chs, mid_chs, 3, stride=stride, dilation=first_dilation, groups=groups) - self.norm2 = norm_layer(mid_chs) - self.conv3 = conv_layer(mid_chs, out_chs, 1) - self.norm3 = norm_layer(out_chs, apply_act=False) - self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() - self.act3 = act_layer(inplace=True) - - def forward(self, x): - # shortcut branch - shortcut = x - if self.downsample is not None: - shortcut = self.downsample(x) - - # residual - x = self.conv1(x) - x = self.norm1(x) - x = self.conv2(x) - x = self.norm2(x) - x = self.conv3(x) - x = self.norm3(x) - x = self.drop_path(x) - x = self.act3(x + shortcut) - return x - - -class DownsampleConv(nn.Module): - def __init__( - self, in_chs, out_chs, stride=1, dilation=1, first_dilation=None, preact=True, - conv_layer=None, norm_layer=None): - super(DownsampleConv, self).__init__() - self.conv = conv_layer(in_chs, out_chs, 1, stride=stride) - self.norm = nn.Identity() if preact else norm_layer(out_chs, apply_act=False) - - def forward(self, x): - return self.norm(self.conv(x)) - - -class DownsampleAvg(nn.Module): - def __init__( - self, in_chs, out_chs, stride=1, dilation=1, first_dilation=None, - preact=True, conv_layer=None, norm_layer=None): - """ AvgPool Downsampling as in 'D' ResNet variants. This is not in RegNet space but I might experiment.""" - super(DownsampleAvg, self).__init__() - avg_stride = stride if dilation == 1 else 1 - if stride > 1 or dilation > 1: - avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d - self.pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False) - else: - self.pool = nn.Identity() - self.conv = conv_layer(in_chs, out_chs, 1, stride=1) - self.norm = nn.Identity() if preact else norm_layer(out_chs, apply_act=False) - - def forward(self, x): - return self.norm(self.conv(self.pool(x))) - - -class ResNetStage(nn.Module): - """ResNet Stage.""" - def __init__(self, in_chs, out_chs, stride, dilation, depth, bottle_ratio=0.25, groups=1, - avg_down=False, block_dpr=None, block_fn=PreActBottleneck, - act_layer=None, conv_layer=None, norm_layer=None, **block_kwargs): - super(ResNetStage, self).__init__() - first_dilation = 1 if dilation in (1, 2) else 2 - layer_kwargs = dict(act_layer=act_layer, conv_layer=conv_layer, norm_layer=norm_layer) - proj_layer = DownsampleAvg if avg_down else DownsampleConv - prev_chs = in_chs - self.blocks = nn.Sequential() - for block_idx in range(depth): - drop_path_rate = block_dpr[block_idx] if block_dpr else 0. 
- stride = stride if block_idx == 0 else 1 - self.blocks.add_module(str(block_idx), block_fn( - prev_chs, out_chs, stride=stride, dilation=dilation, bottle_ratio=bottle_ratio, groups=groups, - first_dilation=first_dilation, proj_layer=proj_layer, drop_path_rate=drop_path_rate, - **layer_kwargs, **block_kwargs)) - prev_chs = out_chs - first_dilation = dilation - proj_layer = None - - def forward(self, x): - x = self.blocks(x) - return x - - -def create_resnetv2_stem( - in_chs, out_chs=64, stem_type='', preact=True, - conv_layer=StdConv2d, norm_layer=partial(GroupNormAct, num_groups=32)): - stem = OrderedDict() - assert stem_type in ('', 'fixed', 'same', 'deep', 'deep_fixed', 'deep_same') - - # NOTE conv padding mode can be changed by overriding the conv_layer def - if 'deep' in stem_type: - # A 3 deep 3x3 conv stack as in ResNet V1D models - mid_chs = out_chs // 2 - stem['conv1'] = conv_layer(in_chs, mid_chs, kernel_size=3, stride=2) - stem['conv2'] = conv_layer(mid_chs, mid_chs, kernel_size=3, stride=1) - stem['conv3'] = conv_layer(mid_chs, out_chs, kernel_size=3, stride=1) - else: - # The usual 7x7 stem conv - stem['conv'] = conv_layer(in_chs, out_chs, kernel_size=7, stride=2) - - if not preact: - stem['norm'] = norm_layer(out_chs) - - if 'fixed' in stem_type: - # 'fixed' SAME padding approximation that is used in BiT models - stem['pad'] = nn.ConstantPad2d(1, 0.) - stem['pool'] = nn.MaxPool2d(kernel_size=3, stride=2, padding=0) - elif 'same' in stem_type: - # full, input size based 'SAME' padding, used in ViT Hybrid model - stem['pool'] = create_pool2d('max', kernel_size=3, stride=2, padding='same') - else: - # the usual PyTorch symmetric padding - stem['pool'] = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - - return nn.Sequential(stem) - - -class ResNetV2(nn.Module): - """Implementation of Pre-activation (v2) ResNet models. 
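- - For example (matching the factory functions below), BiT-M R50x3 is ResNetV2(layers=[3, 4, 6, 3], width_factor=3, - stem_type='fixed'); width_factor scales every channel count via make_div() above.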
- """ - - def __init__(self, layers, channels=(256, 512, 1024, 2048), - num_classes=1000, in_chans=3, global_pool='avg', output_stride=32, - width_factor=1, stem_chs=64, stem_type='', avg_down=False, preact=True, - act_layer=nn.ReLU, conv_layer=StdConv2d, norm_layer=partial(GroupNormAct, num_groups=32), - drop_rate=0., drop_path_rate=0.): - super().__init__() - self.num_classes = num_classes - self.drop_rate = drop_rate - wf = width_factor - - self.feature_info = [] - stem_chs = make_div(stem_chs * wf) - self.stem = create_resnetv2_stem( - in_chans, stem_chs, stem_type, preact, conv_layer=conv_layer, norm_layer=norm_layer) - stem_feat = ('stem.conv3' if 'deep' in stem_type else 'stem.conv') if preact else 'stem.norm' - self.feature_info.append(dict(num_chs=stem_chs, reduction=2, module=stem_feat)) - - prev_chs = stem_chs - curr_stride = 4 - dilation = 1 - block_dprs = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(layers)).split(layers)] - block_fn = PreActBottleneck if preact else Bottleneck - self.stages = nn.Sequential() - for stage_idx, (d, c, bdpr) in enumerate(zip(layers, channels, block_dprs)): - out_chs = make_div(c * wf) - stride = 1 if stage_idx == 0 else 2 - if curr_stride >= output_stride: - dilation *= stride - stride = 1 - stage = ResNetStage( - prev_chs, out_chs, stride=stride, dilation=dilation, depth=d, avg_down=avg_down, - act_layer=act_layer, conv_layer=conv_layer, norm_layer=norm_layer, block_dpr=bdpr, block_fn=block_fn) - prev_chs = out_chs - curr_stride *= stride - self.feature_info += [dict(num_chs=prev_chs, reduction=curr_stride, module=f'stages.{stage_idx}')] - self.stages.add_module(str(stage_idx), stage) - - self.num_features = prev_chs - self.norm = norm_layer(self.num_features) if preact else nn.Identity() - self.head = ClassifierHead( - self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate, use_conv=True) - - for n, m in self.named_modules(): - if isinstance(m, nn.Linear) or ('.fc' in n and isinstance(m, nn.Conv2d)): - nn.init.normal_(m.weight, mean=0.0, std=0.01) - nn.init.zeros_(m.bias) - elif isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - - def get_classifier(self): - return self.head.fc - - def reset_classifier(self, num_classes, global_pool='avg'): - self.num_classes = num_classes - self.head = ClassifierHead( - self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate, use_conv=True) - - def forward_features(self, x): - x = self.stem(x) - x = self.stages(x) - x = self.norm(x) - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.head(x) - if not self.head.global_pool.is_identity(): - x = x.flatten(1) # conv classifier, flatten if pooling isn't pass-through (disabled) - return x - - def load_pretrained(self, checkpoint_path, prefix='resnet/'): - import numpy as np - weights = np.load(checkpoint_path) - with torch.no_grad(): - stem_conv_w = tf2th(weights[f'{prefix}root_block/standardized_conv2d/kernel']) - if self.stem.conv.weight.shape[1] == 1: - self.stem.conv.weight.copy_(stem_conv_w.sum(dim=1, keepdim=True)) - # FIXME handle > 3 in_chans? 
- else: - self.stem.conv.weight.copy_(stem_conv_w) - self.norm.weight.copy_(tf2th(weights[f'{prefix}group_norm/gamma'])) - self.norm.bias.copy_(tf2th(weights[f'{prefix}group_norm/beta'])) - if self.head.fc.weight.shape[0] == weights[f'{prefix}head/conv2d/kernel'].shape[-1]: - self.head.fc.weight.copy_(tf2th(weights[f'{prefix}head/conv2d/kernel'])) - self.head.fc.bias.copy_(tf2th(weights[f'{prefix}head/conv2d/bias'])) - for i, (sname, stage) in enumerate(self.stages.named_children()): - for j, (bname, block) in enumerate(stage.blocks.named_children()): - convname = 'standardized_conv2d' - block_prefix = f'{prefix}block{i + 1}/unit{j + 1:02d}/' - block.conv1.weight.copy_(tf2th(weights[f'{block_prefix}a/{convname}/kernel'])) - block.conv2.weight.copy_(tf2th(weights[f'{block_prefix}b/{convname}/kernel'])) - block.conv3.weight.copy_(tf2th(weights[f'{block_prefix}c/{convname}/kernel'])) - block.norm1.weight.copy_(tf2th(weights[f'{block_prefix}a/group_norm/gamma'])) - block.norm2.weight.copy_(tf2th(weights[f'{block_prefix}b/group_norm/gamma'])) - block.norm3.weight.copy_(tf2th(weights[f'{block_prefix}c/group_norm/gamma'])) - block.norm1.bias.copy_(tf2th(weights[f'{block_prefix}a/group_norm/beta'])) - block.norm2.bias.copy_(tf2th(weights[f'{block_prefix}b/group_norm/beta'])) - block.norm3.bias.copy_(tf2th(weights[f'{block_prefix}c/group_norm/beta'])) - if block.downsample is not None: - w = weights[f'{block_prefix}a/proj/{convname}/kernel'] - block.downsample.conv.weight.copy_(tf2th(w)) - - -def _create_resnetv2(variant, pretrained=False, **kwargs): - feature_cfg = dict(flatten_sequential=True) - return build_model_with_cfg( - ResNetV2, variant, pretrained, - default_cfg=default_cfgs[variant], - feature_cfg=feature_cfg, - pretrained_custom_load=True, - **kwargs) - - -@register_model -def resnetv2_50x1_bitm(pretrained=False, **kwargs): - return _create_resnetv2( - 'resnetv2_50x1_bitm', pretrained=pretrained, - layers=[3, 4, 6, 3], width_factor=1, stem_type='fixed', **kwargs) - - -@register_model -def resnetv2_50x3_bitm(pretrained=False, **kwargs): - return _create_resnetv2( - 'resnetv2_50x3_bitm', pretrained=pretrained, - layers=[3, 4, 6, 3], width_factor=3, stem_type='fixed', **kwargs) - - -@register_model -def resnetv2_101x1_bitm(pretrained=False, **kwargs): - return _create_resnetv2( - 'resnetv2_101x1_bitm', pretrained=pretrained, - layers=[3, 4, 23, 3], width_factor=1, stem_type='fixed', **kwargs) - - -@register_model -def resnetv2_101x3_bitm(pretrained=False, **kwargs): - return _create_resnetv2( - 'resnetv2_101x3_bitm', pretrained=pretrained, - layers=[3, 4, 23, 3], width_factor=3, stem_type='fixed', **kwargs) - - -@register_model -def resnetv2_152x2_bitm(pretrained=False, **kwargs): - return _create_resnetv2( - 'resnetv2_152x2_bitm', pretrained=pretrained, - layers=[3, 8, 36, 3], width_factor=2, stem_type='fixed', **kwargs) - - -@register_model -def resnetv2_152x4_bitm(pretrained=False, **kwargs): - return _create_resnetv2( - 'resnetv2_152x4_bitm', pretrained=pretrained, - layers=[3, 8, 36, 3], width_factor=4, stem_type='fixed', **kwargs) - - -@register_model -def resnetv2_50x1_bitm_in21k(pretrained=False, **kwargs): - return _create_resnetv2( - 'resnetv2_50x1_bitm_in21k', pretrained=pretrained, num_classes=kwargs.pop('num_classes', 21843), - layers=[3, 4, 6, 3], width_factor=1, stem_type='fixed', **kwargs) - - -@register_model -def resnetv2_50x3_bitm_in21k(pretrained=False, **kwargs): - return _create_resnetv2( - 'resnetv2_50x3_bitm_in21k', pretrained=pretrained, 
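- # default to the full 21843-class ImageNet-21k head unless the caller overrides num_classes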
num_classes=kwargs.pop('num_classes', 21843), - layers=[3, 4, 6, 3], width_factor=3, stem_type='fixed', **kwargs) - - -@register_model -def resnetv2_101x1_bitm_in21k(pretrained=False, **kwargs): - return _create_resnetv2( - 'resnetv2_101x1_bitm_in21k', pretrained=pretrained, num_classes=kwargs.pop('num_classes', 21843), - layers=[3, 4, 23, 3], width_factor=1, stem_type='fixed', **kwargs) - - -@register_model -def resnetv2_101x3_bitm_in21k(pretrained=False, **kwargs): - return _create_resnetv2( - 'resnetv2_101x3_bitm_in21k', pretrained=pretrained, num_classes=kwargs.pop('num_classes', 21843), - layers=[3, 4, 23, 3], width_factor=3, stem_type='fixed', **kwargs) - - -@register_model -def resnetv2_152x2_bitm_in21k(pretrained=False, **kwargs): - return _create_resnetv2( - 'resnetv2_152x2_bitm_in21k', pretrained=pretrained, num_classes=kwargs.pop('num_classes', 21843), - layers=[3, 8, 36, 3], width_factor=2, stem_type='fixed', **kwargs) - - -@register_model -def resnetv2_152x4_bitm_in21k(pretrained=False, **kwargs): - return _create_resnetv2( - 'resnetv2_152x4_bitm_in21k', pretrained=pretrained, num_classes=kwargs.pop('num_classes', 21843), - layers=[3, 8, 36, 3], width_factor=4, stem_type='fixed', **kwargs) - - -# NOTE the 'S' versions of the model weights aren't as interesting as original 21k or transfer to 1K M. - -# @register_model -# def resnetv2_50x1_bits(pretrained=False, **kwargs): -# return _create_resnetv2( -# 'resnetv2_50x1_bits', pretrained=pretrained, -# layers=[3, 4, 6, 3], width_factor=1, stem_type='fixed', **kwargs) -# -# -# @register_model -# def resnetv2_50x3_bits(pretrained=False, **kwargs): -# return _create_resnetv2( -# 'resnetv2_50x3_bits', pretrained=pretrained, -# layers=[3, 4, 6, 3], width_factor=3, stem_type='fixed', **kwargs) -# -# -# @register_model -# def resnetv2_101x1_bits(pretrained=False, **kwargs): -# return _create_resnetv2( -# 'resnetv2_101x1_bits', pretrained=pretrained, -# layers=[3, 4, 23, 3], width_factor=1, stem_type='fixed', **kwargs) -# -# -# @register_model -# def resnetv2_101x3_bits(pretrained=False, **kwargs): -# return _create_resnetv2( -# 'resnetv2_101x3_bits', pretrained=pretrained, -# layers=[3, 4, 23, 3], width_factor=3, stem_type='fixed', **kwargs) -# -# -# @register_model -# def resnetv2_152x2_bits(pretrained=False, **kwargs): -# return _create_resnetv2( -# 'resnetv2_152x2_bits', pretrained=pretrained, -# layers=[3, 8, 36, 3], width_factor=2, stem_type='fixed', **kwargs) -# -# -# @register_model -# def resnetv2_152x4_bits(pretrained=False, **kwargs): -# return _create_resnetv2( -# 'resnetv2_152x4_bits', pretrained=pretrained, -# layers=[3, 8, 36, 3], width_factor=4, stem_type='fixed', **kwargs) -# diff --git a/AVLFormer/src/timm/models/rexnet.py b/AVLFormer/src/timm/models/rexnet.py deleted file mode 100644 index e41003f..0000000 --- a/AVLFormer/src/timm/models/rexnet.py +++ /dev/null @@ -1,253 +0,0 @@ -""" ReXNet - -A PyTorch impl of `ReXNet: Diminishing Representational Bottleneck on Convolutional Neural Network` - -https://arxiv.org/abs/2007.00992 - -Adapted from original impl at https://github.com/clovaai/rexnet -Copyright (c) 2020-present NAVER Corp. 
MIT license - -Changes for timm, feature extraction, and rounded channel variant hacked together by Ross Wightman -Copyright 2020 Ross Wightman -""" - -import torch.nn as nn -from math import ceil - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .helpers import build_model_with_cfg -from .layers import ClassifierHead, create_act_layer, ConvBnAct, DropPath, make_divisible -from .registry import register_model -from .efficientnet_builder import efficientnet_init_weights - - -def _cfg(url=''): - return { - 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), - 'crop_pct': 0.875, 'interpolation': 'bicubic', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'stem.conv', 'classifier': 'head.fc', - } - - -default_cfgs = dict( - rexnet_100=_cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rexnet/rexnetv1_100-1b4dddf4.pth'), - rexnet_130=_cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rexnet/rexnetv1_130-590d768e.pth'), - rexnet_150=_cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rexnet/rexnetv1_150-bd1a6aa8.pth'), - rexnet_200=_cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rexnet/rexnetv1_200-8c0b7f2d.pth'), - rexnetr_100=_cfg( - url=''), - rexnetr_130=_cfg( - url=''), - rexnetr_150=_cfg( - url=''), - rexnetr_200=_cfg( - url=''), -) - - -class SEWithNorm(nn.Module): - - def __init__(self, channels, se_ratio=1 / 12., act_layer=nn.ReLU, divisor=1, reduction_channels=None, - gate_layer='sigmoid'): - super(SEWithNorm, self).__init__() - reduction_channels = reduction_channels or make_divisible(int(channels * se_ratio), divisor=divisor) - self.fc1 = nn.Conv2d(channels, reduction_channels, kernel_size=1, bias=True) - self.bn = nn.BatchNorm2d(reduction_channels) - self.act = act_layer(inplace=True) - self.fc2 = nn.Conv2d(reduction_channels, channels, kernel_size=1, bias=True) - self.gate = create_act_layer(gate_layer) - - def forward(self, x): - x_se = x.mean((2, 3), keepdim=True) - x_se = self.fc1(x_se) - x_se = self.bn(x_se) - x_se = self.act(x_se) - x_se = self.fc2(x_se) - return x * self.gate(x_se) - - -class LinearBottleneck(nn.Module): - def __init__(self, in_chs, out_chs, stride, exp_ratio=1.0, se_ratio=0., ch_div=1, - act_layer='swish', dw_act_layer='relu6', drop_path=None): - super(LinearBottleneck, self).__init__() - self.use_shortcut = stride == 1 and in_chs <= out_chs - self.in_channels = in_chs - self.out_channels = out_chs - - if exp_ratio != 1.: - dw_chs = make_divisible(round(in_chs * exp_ratio), divisor=ch_div) - self.conv_exp = ConvBnAct(in_chs, dw_chs, act_layer=act_layer) - else: - dw_chs = in_chs - self.conv_exp = None - - self.conv_dw = ConvBnAct(dw_chs, dw_chs, 3, stride=stride, groups=dw_chs, apply_act=False) - self.se = SEWithNorm(dw_chs, se_ratio=se_ratio, divisor=ch_div) if se_ratio > 0. 
else None - self.act_dw = create_act_layer(dw_act_layer) - - self.conv_pwl = ConvBnAct(dw_chs, out_chs, 1, apply_act=False) - self.drop_path = drop_path - - def feat_channels(self, exp=False): - return self.conv_dw.out_channels if exp else self.out_channels - - def forward(self, x): - shortcut = x - if self.conv_exp is not None: - x = self.conv_exp(x) - x = self.conv_dw(x) - if self.se is not None: - x = self.se(x) - x = self.act_dw(x) - x = self.conv_pwl(x) - if self.drop_path is not None: - x = self.drop_path(x) - if self.use_shortcut: - x[:, 0:self.in_channels] += shortcut - return x - - -def _block_cfg(width_mult=1.0, depth_mult=1.0, initial_chs=16, final_chs=180, se_ratio=0., ch_div=1): - layers = [1, 2, 2, 3, 3, 5] - strides = [1, 2, 2, 2, 1, 2] - layers = [ceil(element * depth_mult) for element in layers] - strides = sum([[element] + [1] * (layers[idx] - 1) for idx, element in enumerate(strides)], []) - exp_ratios = [1] * layers[0] + [6] * sum(layers[1:]) - depth = sum(layers[:]) * 3 - base_chs = initial_chs / width_mult if width_mult < 1.0 else initial_chs - - # The following channel configuration is a simple instance to make each layer become an expand layer. - out_chs_list = [] - for i in range(depth // 3): - out_chs_list.append(make_divisible(round(base_chs * width_mult), divisor=ch_div)) - base_chs += final_chs / (depth // 3 * 1.0) - - se_ratios = [0.] * (layers[0] + layers[1]) + [se_ratio] * sum(layers[2:]) - - return list(zip(out_chs_list, exp_ratios, strides, se_ratios)) - - -def _build_blocks( - block_cfg, prev_chs, width_mult, ch_div=1, act_layer='swish', dw_act_layer='relu6', drop_path_rate=0.): - feat_chs = [prev_chs] - feature_info = [] - curr_stride = 2 - features = [] - num_blocks = len(block_cfg) - for block_idx, (chs, exp_ratio, stride, se_ratio) in enumerate(block_cfg): - if stride > 1: - fname = 'stem' if block_idx == 0 else f'features.{block_idx - 1}' - feature_info += [dict(num_chs=feat_chs[-1], reduction=curr_stride, module=fname)] - curr_stride *= stride - block_dpr = drop_path_rate * block_idx / (num_blocks - 1) # stochastic depth linear decay rule - drop_path = DropPath(block_dpr) if block_dpr > 0. 
else None - features.append(LinearBottleneck( - in_chs=prev_chs, out_chs=chs, exp_ratio=exp_ratio, stride=stride, se_ratio=se_ratio, - ch_div=ch_div, act_layer=act_layer, dw_act_layer=dw_act_layer, drop_path=drop_path)) - prev_chs = chs - feat_chs += [features[-1].feat_channels()] - pen_chs = make_divisible(1280 * width_mult, divisor=ch_div) - feature_info += [dict(num_chs=feat_chs[-1], reduction=curr_stride, module=f'features.{len(features) - 1}')] - features.append(ConvBnAct(prev_chs, pen_chs, act_layer=act_layer)) - return features, feature_info - - -class ReXNetV1(nn.Module): - def __init__(self, in_chans=3, num_classes=1000, global_pool='avg', output_stride=32, - initial_chs=16, final_chs=180, width_mult=1.0, depth_mult=1.0, se_ratio=1/12., - ch_div=1, act_layer='swish', dw_act_layer='relu6', drop_rate=0.2, drop_path_rate=0.): - super(ReXNetV1, self).__init__() - self.drop_rate = drop_rate - self.num_classes = num_classes - - assert output_stride == 32 # FIXME support dilation - stem_base_chs = 32 / width_mult if width_mult < 1.0 else 32 - stem_chs = make_divisible(round(stem_base_chs * width_mult), divisor=ch_div) - self.stem = ConvBnAct(in_chans, stem_chs, 3, stride=2, act_layer=act_layer) - - block_cfg = _block_cfg(width_mult, depth_mult, initial_chs, final_chs, se_ratio, ch_div) - features, self.feature_info = _build_blocks( - block_cfg, stem_chs, width_mult, ch_div, act_layer, dw_act_layer, drop_path_rate) - self.num_features = features[-1].out_channels - self.features = nn.Sequential(*features) - - self.head = ClassifierHead(self.num_features, num_classes, global_pool, drop_rate) - - efficientnet_init_weights(self) - - def get_classifier(self): - return self.head.fc - - def reset_classifier(self, num_classes, global_pool='avg'): - self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) - - def forward_features(self, x): - x = self.stem(x) - x = self.features(x) - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.head(x) - return x - - -def _create_rexnet(variant, pretrained, **kwargs): - feature_cfg = dict(flatten_sequential=True) - return build_model_with_cfg( - ReXNetV1, variant, pretrained, - default_cfg=default_cfgs[variant], - feature_cfg=feature_cfg, - **kwargs) - - -@register_model -def rexnet_100(pretrained=False, **kwargs): - """ReXNet V1 1.0x""" - return _create_rexnet('rexnet_100', pretrained, **kwargs) - - -@register_model -def rexnet_130(pretrained=False, **kwargs): - """ReXNet V1 1.3x""" - return _create_rexnet('rexnet_130', pretrained, width_mult=1.3, **kwargs) - - -@register_model -def rexnet_150(pretrained=False, **kwargs): - """ReXNet V1 1.5x""" - return _create_rexnet('rexnet_150', pretrained, width_mult=1.5, **kwargs) - - -@register_model -def rexnet_200(pretrained=False, **kwargs): - """ReXNet V1 2.0x""" - return _create_rexnet('rexnet_200', pretrained, width_mult=2.0, **kwargs) - - -@register_model -def rexnetr_100(pretrained=False, **kwargs): - """ReXNet V1 1.0x w/ rounded (mod 8) channels""" - return _create_rexnet('rexnetr_100', pretrained, ch_div=8, **kwargs) - - -@register_model -def rexnetr_130(pretrained=False, **kwargs): - """ReXNet V1 1.3x w/ rounded (mod 8) channels""" - return _create_rexnet('rexnetr_130', pretrained, width_mult=1.3, ch_div=8, **kwargs) - - -@register_model -def rexnetr_150(pretrained=False, **kwargs): - """ReXNet V1 1.5x w/ rounded (mod 8) channels""" - return _create_rexnet('rexnetr_150', pretrained, width_mult=1.5, ch_div=8, **kwargs) - - 
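For reference, the linear channel schedule that _block_cfg above builds can be reproduced standalone. This is a minimal sketch; make_divisible_simple is a simplified stand-in (an assumption) for timm's make_divisible, so exact rounding may differ at other divisors:

from math import ceil

def make_divisible_simple(v, divisor=1):
    # simplified stand-in for timm's make_divisible (ignores its 0.9 * v floor)
    return max(divisor, int(v + divisor / 2) // divisor * divisor)

def rexnet_out_channels(width_mult=1.0, depth_mult=1.0, initial_chs=16, final_chs=180, ch_div=1):
    layers = [ceil(n * depth_mult) for n in [1, 2, 2, 3, 3, 5]]
    depth = sum(layers)
    base_chs = initial_chs / width_mult if width_mult < 1.0 else initial_chs
    out_chs = []
    for _ in range(depth):
        out_chs.append(make_divisible_simple(round(base_chs * width_mult), divisor=ch_div))
        base_chs += final_chs / depth  # channels grow linearly toward final_chs
    return out_chs

print(rexnet_out_channels())  # [16, 27, 38, 50, 61, ...] for rexnet_100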
-@register_model -def rexnetr_200(pretrained=False, **kwargs): - """ReXNet V1 2.0x w/ rounded (mod 8) channels""" - return _create_rexnet('rexnetr_200', pretrained, width_mult=2.0, ch_div=8, **kwargs) diff --git a/AVLFormer/src/timm/models/selecsls.py b/AVLFormer/src/timm/models/selecsls.py deleted file mode 100644 index 8735faa..0000000 --- a/AVLFormer/src/timm/models/selecsls.py +++ /dev/null @@ -1,362 +0,0 @@ -"""PyTorch SelecSLS Net example for ImageNet Classification -License: CC BY 4.0 (https://creativecommons.org/licenses/by/4.0/legalcode) -Author: Dushyant Mehta (@mehtadushy) - -SelecSLS (core) Network Architecture as proposed in "XNect: Real-time Multi-person 3D -Human Pose Estimation with a Single RGB Camera, Mehta et al." -https://arxiv.org/abs/1907.00837 - -Based on ResNet implementation in https://github.com/rwightman/pytorch-image-models -and SelecSLS Net implementation in https://github.com/mehtadushy/SelecSLS-Pytorch -""" -from typing import List - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .helpers import build_model_with_cfg -from .layers import create_classifier -from .registry import register_model - -__all__ = ['SelecSLS'] # model_registry will add each entrypoint fn to this - - -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (4, 4), - 'crop_pct': 0.875, 'interpolation': 'bilinear', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'stem.0', 'classifier': 'fc', - **kwargs - } - - -default_cfgs = { - 'selecsls42': _cfg( - url='', - interpolation='bicubic'), - 'selecsls42b': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-selecsls/selecsls42b-8af30141.pth', - interpolation='bicubic'), - 'selecsls60': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-selecsls/selecsls60-bbf87526.pth', - interpolation='bicubic'), - 'selecsls60b': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-selecsls/selecsls60b-94e619b5.pth', - interpolation='bicubic'), - 'selecsls84': _cfg( - url='', - interpolation='bicubic'), -} - - -class SequentialList(nn.Sequential): - - def __init__(self, *args): - super(SequentialList, self).__init__(*args) - - @torch.jit._overload_method # noqa: F811 - def forward(self, x): - # type: (List[torch.Tensor]) -> (List[torch.Tensor]) - pass - - @torch.jit._overload_method # noqa: F811 - def forward(self, x): - # type: (torch.Tensor) -> (List[torch.Tensor]) - pass - - def forward(self, x) -> List[torch.Tensor]: - for module in self: - x = module(x) - return x - - -class SelectSeq(nn.Module): - def __init__(self, mode='index', index=0): - super(SelectSeq, self).__init__() - self.mode = mode - self.index = index - - @torch.jit._overload_method # noqa: F811 - def forward(self, x): - # type: (List[torch.Tensor]) -> (torch.Tensor) - pass - - @torch.jit._overload_method # noqa: F811 - def forward(self, x): - # type: (Tuple[torch.Tensor]) -> (torch.Tensor) - pass - - def forward(self, x) -> torch.Tensor: - if self.mode == 'index': - return x[self.index] - else: - return torch.cat(x, dim=1) - - -def conv_bn(in_chs, out_chs, k=3, stride=1, padding=None, dilation=1): - if padding is None: - padding = ((stride - 1) + dilation * (k - 1)) // 2 - return nn.Sequential( - nn.Conv2d(in_chs, out_chs, k, stride, padding=padding, dilation=dilation, bias=False), - 
nn.BatchNorm2d(out_chs), - nn.ReLU(inplace=True) - ) - - -class SelecSLSBlock(nn.Module): - def __init__(self, in_chs, skip_chs, mid_chs, out_chs, is_first, stride, dilation=1): - super(SelecSLSBlock, self).__init__() - self.stride = stride - self.is_first = is_first - assert stride in [1, 2] - - # Process input with 4 conv blocks with the same number of input and output channels - self.conv1 = conv_bn(in_chs, mid_chs, 3, stride, dilation=dilation) - self.conv2 = conv_bn(mid_chs, mid_chs, 1) - self.conv3 = conv_bn(mid_chs, mid_chs // 2, 3) - self.conv4 = conv_bn(mid_chs // 2, mid_chs, 1) - self.conv5 = conv_bn(mid_chs, mid_chs // 2, 3) - self.conv6 = conv_bn(2 * mid_chs + (0 if is_first else skip_chs), out_chs, 1) - - def forward(self, x: List[torch.Tensor]) -> List[torch.Tensor]: - if not isinstance(x, list): - x = [x] - assert len(x) in [1, 2] - - d1 = self.conv1(x[0]) - d2 = self.conv3(self.conv2(d1)) - d3 = self.conv5(self.conv4(d2)) - if self.is_first: - out = self.conv6(torch.cat([d1, d2, d3], 1)) - return [out, out] - else: - return [self.conv6(torch.cat([d1, d2, d3, x[1]], 1)), x[1]] - - -class SelecSLS(nn.Module): - """SelecSLS42 / SelecSLS60 / SelecSLS84 - - Parameters - ---------- - cfg : network config dictionary specifying block type, feature, and head args - num_classes : int, default 1000 - Number of classification classes. - in_chans : int, default 3 - Number of input (color) channels. - drop_rate : float, default 0. - Dropout probability before classifier, for training - global_pool : str, default 'avg' - Global pooling type. One of 'avg', 'max', 'avgmax', 'catavgmax' - """ - - def __init__(self, cfg, num_classes=1000, in_chans=3, drop_rate=0.0, global_pool='avg'): - self.num_classes = num_classes - self.drop_rate = drop_rate - super(SelecSLS, self).__init__() - - self.stem = conv_bn(in_chans, 32, stride=2) - self.features = SequentialList(*[cfg['block'](*block_args) for block_args in cfg['features']]) - self.from_seq = SelectSeq() # from List[tensor] -> Tensor in module compatible way - self.head = nn.Sequential(*[conv_bn(*conv_args) for conv_args in cfg['head']]) - self.num_features = cfg['num_features'] - self.feature_info = cfg['feature_info'] - - self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) - - for n, m in self.named_modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1.) - nn.init.constant_(m.bias, 0.) 
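A quick aside on conv_bn above: its default padding, ((stride - 1) + dilation * (k - 1)) // 2, preserves spatial size at stride 1 for odd kernels. A minimal sketch (assuming only plain PyTorch) checking that property:

import torch
import torch.nn as nn

def implicit_pad(k, stride=1, dilation=1):
    # same rule conv_bn uses when padding is None
    return ((stride - 1) + dilation * (k - 1)) // 2

x = torch.randn(1, 8, 32, 32)
for k, d in [(1, 1), (3, 1), (3, 2)]:
    conv = nn.Conv2d(8, 8, k, stride=1, padding=implicit_pad(k, 1, d), dilation=d, bias=False)
    assert conv(x).shape[-2:] == x.shape[-2:]  # spatial size preserved at stride 1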
- - def get_classifier(self): - return self.fc - - def reset_classifier(self, num_classes, global_pool='avg'): - self.num_classes = num_classes - self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) - - def forward_features(self, x): - x = self.stem(x) - x = self.features(x) - x = self.head(self.from_seq(x)) - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.global_pool(x) - if self.drop_rate > 0.: - x = F.dropout(x, p=self.drop_rate, training=self.training) - x = self.fc(x) - return x - - -def _create_selecsls(variant, pretrained, **kwargs): - cfg = {} - feature_info = [dict(num_chs=32, reduction=2, module='stem.2')] - if variant.startswith('selecsls42'): - cfg['block'] = SelecSLSBlock - # Define configuration of the network after the initial neck - cfg['features'] = [ - # in_chs, skip_chs, mid_chs, out_chs, is_first, stride - (32, 0, 64, 64, True, 2), - (64, 64, 64, 128, False, 1), - (128, 0, 144, 144, True, 2), - (144, 144, 144, 288, False, 1), - (288, 0, 304, 304, True, 2), - (304, 304, 304, 480, False, 1), - ] - feature_info.extend([ - dict(num_chs=128, reduction=4, module='features.1'), - dict(num_chs=288, reduction=8, module='features.3'), - dict(num_chs=480, reduction=16, module='features.5'), - ]) - # Head can be replaced with alternative configurations depending on the problem - feature_info.append(dict(num_chs=1024, reduction=32, module='head.1')) - if variant == 'selecsls42b': - cfg['head'] = [ - (480, 960, 3, 2), - (960, 1024, 3, 1), - (1024, 1280, 3, 2), - (1280, 1024, 1, 1), - ] - feature_info.append(dict(num_chs=1024, reduction=64, module='head.3')) - cfg['num_features'] = 1024 - else: - cfg['head'] = [ - (480, 960, 3, 2), - (960, 1024, 3, 1), - (1024, 1024, 3, 2), - (1024, 1280, 1, 1), - ] - feature_info.append(dict(num_chs=1280, reduction=64, module='head.3')) - cfg['num_features'] = 1280 - - elif variant.startswith('selecsls60'): - cfg['block'] = SelecSLSBlock - # Define configuration of the network after the initial neck - cfg['features'] = [ - # in_chs, skip_chs, mid_chs, out_chs, is_first, stride - (32, 0, 64, 64, True, 2), - (64, 64, 64, 128, False, 1), - (128, 0, 128, 128, True, 2), - (128, 128, 128, 128, False, 1), - (128, 128, 128, 288, False, 1), - (288, 0, 288, 288, True, 2), - (288, 288, 288, 288, False, 1), - (288, 288, 288, 288, False, 1), - (288, 288, 288, 416, False, 1), - ] - feature_info.extend([ - dict(num_chs=128, reduction=4, module='features.1'), - dict(num_chs=288, reduction=8, module='features.4'), - dict(num_chs=416, reduction=16, module='features.8'), - ]) - # Head can be replaced with alternative configurations depending on the problem - feature_info.append(dict(num_chs=1024, reduction=32, module='head.1')) - if variant == 'selecsls60b': - cfg['head'] = [ - (416, 756, 3, 2), - (756, 1024, 3, 1), - (1024, 1280, 3, 2), - (1280, 1024, 1, 1), - ] - feature_info.append(dict(num_chs=1024, reduction=64, module='head.3')) - cfg['num_features'] = 1024 - else: - cfg['head'] = [ - (416, 756, 3, 2), - (756, 1024, 3, 1), - (1024, 1024, 3, 2), - (1024, 1280, 1, 1), - ] - feature_info.append(dict(num_chs=1280, reduction=64, module='head.3')) - cfg['num_features'] = 1280 - - elif variant == 'selecsls84': - cfg['block'] = SelecSLSBlock - # Define configuration of the network after the initial neck - cfg['features'] = [ - # in_chs, skip_chs, mid_chs, out_chs, is_first, stride - (32, 0, 64, 64, True, 2), - (64, 64, 64, 144, False, 1), - (144, 0, 144, 144, True, 2), - (144, 144, 144, 
144, False, 1), - (144, 144, 144, 144, False, 1), - (144, 144, 144, 144, False, 1), - (144, 144, 144, 304, False, 1), - (304, 0, 304, 304, True, 2), - (304, 304, 304, 304, False, 1), - (304, 304, 304, 304, False, 1), - (304, 304, 304, 304, False, 1), - (304, 304, 304, 304, False, 1), - (304, 304, 304, 512, False, 1), - ] - feature_info.extend([ - dict(num_chs=144, reduction=4, module='features.1'), - dict(num_chs=304, reduction=8, module='features.6'), - dict(num_chs=512, reduction=16, module='features.12'), - ]) - # Head can be replaced with alternative configurations depending on the problem - cfg['head'] = [ - (512, 960, 3, 2), - (960, 1024, 3, 1), - (1024, 1024, 3, 2), - (1024, 1280, 3, 1), - ] - cfg['num_features'] = 1280 - feature_info.extend([ - dict(num_chs=1024, reduction=32, module='head.1'), - dict(num_chs=1280, reduction=64, module='head.3') - ]) - else: - raise ValueError('Invalid net configuration ' + variant + ' !!!') - cfg['feature_info'] = feature_info - - # this model can do 6 feature levels by default, unlike most others, leave as 0-4 to avoid surprises? - return build_model_with_cfg( - SelecSLS, variant, pretrained, - default_cfg=default_cfgs[variant], - model_cfg=cfg, - feature_cfg=dict(out_indices=(0, 1, 2, 3, 4), flatten_sequential=True), - **kwargs) - - -@register_model -def selecsls42(pretrained=False, **kwargs): - """Constructs a SelecSLS42 model. - """ - return _create_selecsls('selecsls42', pretrained, **kwargs) - - -@register_model -def selecsls42b(pretrained=False, **kwargs): - """Constructs a SelecSLS42_B model. - """ - return _create_selecsls('selecsls42b', pretrained, **kwargs) - - -@register_model -def selecsls60(pretrained=False, **kwargs): - """Constructs a SelecSLS60 model. - """ - return _create_selecsls('selecsls60', pretrained, **kwargs) - - -@register_model -def selecsls60b(pretrained=False, **kwargs): - """Constructs a SelecSLS60_B model. - """ - return _create_selecsls('selecsls60b', pretrained, **kwargs) - - -@register_model -def selecsls84(pretrained=False, **kwargs): - """Constructs a SelecSLS84 model. - """ - return _create_selecsls('selecsls84', pretrained, **kwargs) diff --git a/AVLFormer/src/timm/models/senet.py b/AVLFormer/src/timm/models/senet.py deleted file mode 100644 index d83e4af..0000000 --- a/AVLFormer/src/timm/models/senet.py +++ /dev/null @@ -1,467 +0,0 @@ -""" -SEResNet implementation from Cadene's pretrained models -https://github.com/Cadene/pretrained-models.pytorch/blob/master/pretrainedmodels/models/senet.py -Additional credit to https://github.com/creafz - -Original model: https://github.com/hujie-frank/SENet - -ResNet code gently borrowed from -https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py - -FIXME I'm deprecating this model and moving them to ResNet as I don't want to maintain duplicate -support for extras like dilation, switchable BN/activations, feature extraction, etc that don't exist here. 
-""" -import math -from collections import OrderedDict - -import torch.nn as nn -import torch.nn.functional as F - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .helpers import build_model_with_cfg -from .layers import create_classifier -from .registry import register_model - -__all__ = ['SENet'] - - -def _cfg(url='', **kwargs): - return { - 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), - 'crop_pct': 0.875, 'interpolation': 'bilinear', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'layer0.conv1', 'classifier': 'last_linear', - **kwargs - } - - -default_cfgs = { - 'legacy_senet154': - _cfg(url='http://data.lip6.fr/cadene/pretrainedmodels/senet154-c7b49a05.pth'), - 'legacy_seresnet18': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnet18-4bb0ce65.pth', - interpolation='bicubic'), - 'legacy_seresnet34': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnet34-a4004e63.pth'), - 'legacy_seresnet50': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-cadene/se_resnet50-ce0d4300.pth'), - 'legacy_seresnet101': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-cadene/se_resnet101-7e38fcc6.pth'), - 'legacy_seresnet152': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-cadene/se_resnet152-d17c99b7.pth'), - 'legacy_seresnext26_32x4d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnext26_32x4d-65ebdb501.pth', - interpolation='bicubic'), - 'legacy_seresnext50_32x4d': - _cfg(url='http://data.lip6.fr/cadene/pretrainedmodels/se_resnext50_32x4d-a260b3a4.pth'), - 'legacy_seresnext101_32x4d': - _cfg(url='http://data.lip6.fr/cadene/pretrainedmodels/se_resnext101_32x4d-3b2fe3d8.pth'), -} - - -def _weight_init(m): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1.) - nn.init.constant_(m.bias, 0.) - - -class SEModule(nn.Module): - - def __init__(self, channels, reduction): - super(SEModule, self).__init__() - self.fc1 = nn.Conv2d(channels, channels // reduction, kernel_size=1) - self.relu = nn.ReLU(inplace=True) - self.fc2 = nn.Conv2d(channels // reduction, channels, kernel_size=1) - self.sigmoid = nn.Sigmoid() - - def forward(self, x): - module_input = x - x = x.mean((2, 3), keepdim=True) - x = self.fc1(x) - x = self.relu(x) - x = self.fc2(x) - x = self.sigmoid(x) - return module_input * x - - -class Bottleneck(nn.Module): - """ - Base class for bottlenecks that implements `forward()` method. - """ - - def forward(self, x): - residual = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - out = self.relu(out) - - out = self.conv3(out) - out = self.bn3(out) - - if self.downsample is not None: - residual = self.downsample(x) - - out = self.se_module(out) + residual - out = self.relu(out) - - return out - - -class SEBottleneck(Bottleneck): - """ - Bottleneck for SENet154. 
- """ - expansion = 4 - - def __init__(self, inplanes, planes, groups, reduction, stride=1, - downsample=None): - super(SEBottleneck, self).__init__() - self.conv1 = nn.Conv2d(inplanes, planes * 2, kernel_size=1, bias=False) - self.bn1 = nn.BatchNorm2d(planes * 2) - self.conv2 = nn.Conv2d( - planes * 2, planes * 4, kernel_size=3, stride=stride, - padding=1, groups=groups, bias=False) - self.bn2 = nn.BatchNorm2d(planes * 4) - self.conv3 = nn.Conv2d( - planes * 4, planes * 4, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(planes * 4) - self.relu = nn.ReLU(inplace=True) - self.se_module = SEModule(planes * 4, reduction=reduction) - self.downsample = downsample - self.stride = stride - - -class SEResNetBottleneck(Bottleneck): - """ - ResNet bottleneck with a Squeeze-and-Excitation module. It follows Caffe - implementation and uses `stride=stride` in `conv1` and not in `conv2` - (the latter is used in the torchvision implementation of ResNet). - """ - expansion = 4 - - def __init__(self, inplanes, planes, groups, reduction, stride=1, - downsample=None): - super(SEResNetBottleneck, self).__init__() - self.conv1 = nn.Conv2d( - inplanes, planes, kernel_size=1, bias=False, stride=stride) - self.bn1 = nn.BatchNorm2d(planes) - self.conv2 = nn.Conv2d( - planes, planes, kernel_size=3, padding=1, groups=groups, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(planes * 4) - self.relu = nn.ReLU(inplace=True) - self.se_module = SEModule(planes * 4, reduction=reduction) - self.downsample = downsample - self.stride = stride - - -class SEResNeXtBottleneck(Bottleneck): - """ - ResNeXt bottleneck type C with a Squeeze-and-Excitation module. - """ - expansion = 4 - - def __init__(self, inplanes, planes, groups, reduction, stride=1, - downsample=None, base_width=4): - super(SEResNeXtBottleneck, self).__init__() - width = math.floor(planes * (base_width / 64)) * groups - self.conv1 = nn.Conv2d( - inplanes, width, kernel_size=1, bias=False, stride=1) - self.bn1 = nn.BatchNorm2d(width) - self.conv2 = nn.Conv2d( - width, width, kernel_size=3, stride=stride, padding=1, groups=groups, bias=False) - self.bn2 = nn.BatchNorm2d(width) - self.conv3 = nn.Conv2d(width, planes * 4, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(planes * 4) - self.relu = nn.ReLU(inplace=True) - self.se_module = SEModule(planes * 4, reduction=reduction) - self.downsample = downsample - self.stride = stride - - -class SEResNetBlock(nn.Module): - expansion = 1 - - def __init__(self, inplanes, planes, groups, reduction, stride=1, downsample=None): - super(SEResNetBlock, self).__init__() - self.conv1 = nn.Conv2d( - inplanes, planes, kernel_size=3, padding=1, stride=stride, bias=False) - self.bn1 = nn.BatchNorm2d(planes) - self.conv2 = nn.Conv2d( - planes, planes, kernel_size=3, padding=1, groups=groups, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - self.relu = nn.ReLU(inplace=True) - self.se_module = SEModule(planes, reduction=reduction) - self.downsample = downsample - self.stride = stride - - def forward(self, x): - residual = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - out = self.relu(out) - - if self.downsample is not None: - residual = self.downsample(x) - - out = self.se_module(out) + residual - out = self.relu(out) - - return out - - -class SENet(nn.Module): - - def __init__(self, block, layers, groups, reduction, drop_rate=0.2, - in_chans=3, 
inplanes=64, input_3x3=False, downsample_kernel_size=1, - downsample_padding=0, num_classes=1000, global_pool='avg'): - """ - Parameters - ---------- - block (nn.Module): Bottleneck class. - - For SENet154: SEBottleneck - - For SE-ResNet models: SEResNetBottleneck - - For SE-ResNeXt models: SEResNeXtBottleneck - layers (list of ints): Number of residual blocks for 4 layers of the - network (layer1...layer4). - groups (int): Number of groups for the 3x3 convolution in each - bottleneck block. - - For SENet154: 64 - - For SE-ResNet models: 1 - - For SE-ResNeXt models: 32 - reduction (int): Reduction ratio for Squeeze-and-Excitation modules. - - For all models: 16 - drop_rate (float): Drop probability for the Dropout layer before the classifier. - If 0. the Dropout layer is not used. - - For SENet154: 0.2 - - For SE-ResNet models: 0. - - For SE-ResNeXt models: 0. - inplanes (int): Number of input channels for layer1. - - For SENet154: 128 - - For SE-ResNet models: 64 - - For SE-ResNeXt models: 64 - input_3x3 (bool): If `True`, use three 3x3 convolutions instead of - a single 7x7 convolution in layer0. - - For SENet154: True - - For SE-ResNet models: False - - For SE-ResNeXt models: False - downsample_kernel_size (int): Kernel size for downsampling convolutions - in layer2, layer3 and layer4. - - For SENet154: 3 - - For SE-ResNet models: 1 - - For SE-ResNeXt models: 1 - downsample_padding (int): Padding for downsampling convolutions in - layer2, layer3 and layer4. - - For SENet154: 1 - - For SE-ResNet models: 0 - - For SE-ResNeXt models: 0 - num_classes (int): Number of outputs in `last_linear` layer. - - For all models: 1000 - """ - super(SENet, self).__init__() - self.inplanes = inplanes - self.num_classes = num_classes - self.drop_rate = drop_rate - if input_3x3: - layer0_modules = [ - ('conv1', nn.Conv2d(in_chans, 64, 3, stride=2, padding=1, bias=False)), - ('bn1', nn.BatchNorm2d(64)), - ('relu1', nn.ReLU(inplace=True)), - ('conv2', nn.Conv2d(64, 64, 3, stride=1, padding=1, bias=False)), - ('bn2', nn.BatchNorm2d(64)), - ('relu2', nn.ReLU(inplace=True)), - ('conv3', nn.Conv2d(64, inplanes, 3, stride=1, padding=1, bias=False)), - ('bn3', nn.BatchNorm2d(inplanes)), - ('relu3', nn.ReLU(inplace=True)), - ] - else: - layer0_modules = [ - ('conv1', nn.Conv2d( - in_chans, inplanes, kernel_size=7, stride=2, padding=3, bias=False)), - ('bn1', nn.BatchNorm2d(inplanes)), - ('relu1', nn.ReLU(inplace=True)), - ] - self.layer0 = nn.Sequential(OrderedDict(layer0_modules)) - # To preserve compatibility with Caffe weights `ceil_mode=True` is used instead of `padding=1`.
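The squeeze-and-excitation gating that SEModule above applies reduces to a handful of tensor ops; a minimal standalone sketch (plain PyTorch, with illustrative channel counts) of the same squeeze -> excite -> recalibrate pattern:

import torch
import torch.nn as nn

channels, reduction = 64, 16
fc1 = nn.Conv2d(channels, channels // reduction, kernel_size=1)  # squeeze bottleneck
fc2 = nn.Conv2d(channels // reduction, channels, kernel_size=1)  # expand back

x = torch.randn(2, channels, 56, 56)
s = x.mean((2, 3), keepdim=True)            # squeeze: global average pool -> (2, 64, 1, 1)
s = torch.sigmoid(fc2(torch.relu(fc1(s))))  # excite: per-channel gate in (0, 1)
out = x * s                                 # recalibrate: scale each channel of x
assert out.shape == x.shape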
- self.pool0 = nn.MaxPool2d(3, stride=2, ceil_mode=True) - self.feature_info = [dict(num_chs=inplanes, reduction=2, module='layer0')] - self.layer1 = self._make_layer( - block, - planes=64, - blocks=layers[0], - groups=groups, - reduction=reduction, - downsample_kernel_size=1, - downsample_padding=0 - ) - self.feature_info += [dict(num_chs=64 * block.expansion, reduction=4, module='layer1')] - self.layer2 = self._make_layer( - block, - planes=128, - blocks=layers[1], - stride=2, - groups=groups, - reduction=reduction, - downsample_kernel_size=downsample_kernel_size, - downsample_padding=downsample_padding - ) - self.feature_info += [dict(num_chs=128 * block.expansion, reduction=8, module='layer2')] - self.layer3 = self._make_layer( - block, - planes=256, - blocks=layers[2], - stride=2, - groups=groups, - reduction=reduction, - downsample_kernel_size=downsample_kernel_size, - downsample_padding=downsample_padding - ) - self.feature_info += [dict(num_chs=256 * block.expansion, reduction=16, module='layer3')] - self.layer4 = self._make_layer( - block, - planes=512, - blocks=layers[3], - stride=2, - groups=groups, - reduction=reduction, - downsample_kernel_size=downsample_kernel_size, - downsample_padding=downsample_padding - ) - self.feature_info += [dict(num_chs=512 * block.expansion, reduction=32, module='layer4')] - self.num_features = 512 * block.expansion - self.global_pool, self.last_linear = create_classifier( - self.num_features, self.num_classes, pool_type=global_pool) - - for m in self.modules(): - _weight_init(m) - - def _make_layer(self, block, planes, blocks, groups, reduction, stride=1, - downsample_kernel_size=1, downsample_padding=0): - downsample = None - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - nn.Conv2d( - self.inplanes, planes * block.expansion, kernel_size=downsample_kernel_size, - stride=stride, padding=downsample_padding, bias=False), - nn.BatchNorm2d(planes * block.expansion), - ) - - layers = [block(self.inplanes, planes, groups, reduction, stride, downsample)] - self.inplanes = planes * block.expansion - for i in range(1, blocks): - layers.append(block(self.inplanes, planes, groups, reduction)) - - return nn.Sequential(*layers) - - def get_classifier(self): - return self.last_linear - - def reset_classifier(self, num_classes, global_pool='avg'): - self.num_classes = num_classes - self.global_pool, self.last_linear = create_classifier( - self.num_features, self.num_classes, pool_type=global_pool) - - def forward_features(self, x): - x = self.layer0(x) - x = self.pool0(x) - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.layer4(x) - return x - - def logits(self, x): - x = self.global_pool(x) - if self.drop_rate > 0.: - x = F.dropout(x, p=self.drop_rate, training=self.training) - x = self.last_linear(x) - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.logits(x) - return x - - -def _create_senet(variant, pretrained=False, **kwargs): - return build_model_with_cfg( - SENet, variant, pretrained, - default_cfg=default_cfgs[variant], - **kwargs) - - -@register_model -def legacy_seresnet18(pretrained=False, **kwargs): - model_args = dict( - block=SEResNetBlock, layers=[2, 2, 2, 2], groups=1, reduction=16, **kwargs) - return _create_senet('legacy_seresnet18', pretrained, **model_args) - - -@register_model -def legacy_seresnet34(pretrained=False, **kwargs): - model_args = dict( - block=SEResNetBlock, layers=[3, 4, 6, 3], groups=1, reduction=16, **kwargs) - return 
_create_senet('legacy_seresnet34', pretrained, **model_args) - - -@register_model -def legacy_seresnet50(pretrained=False, **kwargs): - model_args = dict( - block=SEResNetBottleneck, layers=[3, 4, 6, 3], groups=1, reduction=16, **kwargs) - return _create_senet('legacy_seresnet50', pretrained, **model_args) - - -@register_model -def legacy_seresnet101(pretrained=False, **kwargs): - model_args = dict( - block=SEResNetBottleneck, layers=[3, 4, 23, 3], groups=1, reduction=16, **kwargs) - return _create_senet('legacy_seresnet101', pretrained, **model_args) - - -@register_model -def legacy_seresnet152(pretrained=False, **kwargs): - model_args = dict( - block=SEResNetBottleneck, layers=[3, 8, 36, 3], groups=1, reduction=16, **kwargs) - return _create_senet('legacy_seresnet152', pretrained, **model_args) - - -@register_model -def legacy_senet154(pretrained=False, **kwargs): - model_args = dict( - block=SEBottleneck, layers=[3, 8, 36, 3], groups=64, reduction=16, - downsample_kernel_size=3, downsample_padding=1, inplanes=128, input_3x3=True, **kwargs) - return _create_senet('legacy_senet154', pretrained, **model_args) - - -@register_model -def legacy_seresnext26_32x4d(pretrained=False, **kwargs): - model_args = dict( - block=SEResNeXtBottleneck, layers=[2, 2, 2, 2], groups=32, reduction=16, **kwargs) - return _create_senet('legacy_seresnext26_32x4d', pretrained, **model_args) - - -@register_model -def legacy_seresnext50_32x4d(pretrained=False, **kwargs): - model_args = dict( - block=SEResNeXtBottleneck, layers=[3, 4, 6, 3], groups=32, reduction=16, **kwargs) - return _create_senet('legacy_seresnext50_32x4d', pretrained, **model_args) - - -@register_model -def legacy_seresnext101_32x4d(pretrained=False, **kwargs): - model_args = dict( - block=SEResNeXtBottleneck, layers=[3, 4, 23, 3], groups=32, reduction=16, **kwargs) - return _create_senet('legacy_seresnext101_32x4d', pretrained, **model_args) diff --git a/AVLFormer/src/timm/models/sknet.py b/AVLFormer/src/timm/models/sknet.py deleted file mode 100644 index 1893ba6..0000000 --- a/AVLFormer/src/timm/models/sknet.py +++ /dev/null @@ -1,220 +0,0 @@ -""" Selective Kernel Networks (ResNet base) - -Paper: Selective Kernel Networks (https://arxiv.org/abs/1903.06586) - -This was inspired by reading 'Compounding the Performance Improvements...' (https://arxiv.org/abs/2001.06268) -and a streamlined impl at https://github.com/clovaai/assembled-cnn but I ended up building something closer -to the original paper with some modifications of my own to better balance param count vs accuracy. 
- -Hacked together by / Copyright 2020 Ross Wightman -""" -import math - -from torch import nn as nn - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .helpers import build_model_with_cfg -from .layers import SelectiveKernelConv, ConvBnAct, create_attn -from .registry import register_model -from .resnet import ResNet - - -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), - 'crop_pct': 0.875, 'interpolation': 'bicubic', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'conv1', 'classifier': 'fc', - **kwargs - } - - -default_cfgs = { - 'skresnet18': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/skresnet18_ra-4eec2804.pth'), - 'skresnet34': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/skresnet34_ra-bdc0ccde.pth'), - 'skresnet50': _cfg(), - 'skresnet50d': _cfg( - first_conv='conv1.0'), - 'skresnext50_32x4d': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/skresnext50_ra-f40e40bf.pth'), -} - - -class SelectiveKernelBasic(nn.Module): - expansion = 1 - - def __init__(self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64, - sk_kwargs=None, reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, - norm_layer=nn.BatchNorm2d, attn_layer=None, aa_layer=None, drop_block=None, drop_path=None): - super(SelectiveKernelBasic, self).__init__() - - sk_kwargs = sk_kwargs or {} - conv_kwargs = dict(drop_block=drop_block, act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer) - assert cardinality == 1, 'BasicBlock only supports cardinality of 1' - assert base_width == 64, 'BasicBlock does not support changing base width' - first_planes = planes // reduce_first - outplanes = planes * self.expansion - first_dilation = first_dilation or dilation - - self.conv1 = SelectiveKernelConv( - inplanes, first_planes, stride=stride, dilation=first_dilation, **conv_kwargs, **sk_kwargs) - conv_kwargs['act_layer'] = None - self.conv2 = ConvBnAct( - first_planes, outplanes, kernel_size=3, dilation=dilation, **conv_kwargs) - self.se = create_attn(attn_layer, outplanes) - self.act = act_layer(inplace=True) - self.downsample = downsample - self.stride = stride - self.dilation = dilation - self.drop_block = drop_block - self.drop_path = drop_path - - def zero_init_last_bn(self): - nn.init.zeros_(self.conv2.bn.weight) - - def forward(self, x): - residual = x - x = self.conv1(x) - x = self.conv2(x) - if self.se is not None: - x = self.se(x) - if self.drop_path is not None: - x = self.drop_path(x) - if self.downsample is not None: - residual = self.downsample(residual) - x += residual - x = self.act(x) - return x - - -class SelectiveKernelBottleneck(nn.Module): - expansion = 4 - - def __init__(self, inplanes, planes, stride=1, downsample=None, - cardinality=1, base_width=64, sk_kwargs=None, reduce_first=1, dilation=1, first_dilation=None, - act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, attn_layer=None, aa_layer=None, - drop_block=None, drop_path=None): - super(SelectiveKernelBottleneck, self).__init__() - - sk_kwargs = sk_kwargs or {} - conv_kwargs = dict(drop_block=drop_block, act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer) - width = int(math.floor(planes * (base_width / 64)) * cardinality) - first_planes = width // reduce_first - outplanes = planes * self.expansion - first_dilation = first_dilation
or dilation - - self.conv1 = ConvBnAct(inplanes, first_planes, kernel_size=1, **conv_kwargs) - self.conv2 = SelectiveKernelConv( - first_planes, width, stride=stride, dilation=first_dilation, groups=cardinality, - **conv_kwargs, **sk_kwargs) - conv_kwargs['act_layer'] = None - self.conv3 = ConvBnAct(width, outplanes, kernel_size=1, **conv_kwargs) - self.se = create_attn(attn_layer, outplanes) - self.act = act_layer(inplace=True) - self.downsample = downsample - self.stride = stride - self.dilation = dilation - self.drop_block = drop_block - self.drop_path = drop_path - - def zero_init_last_bn(self): - nn.init.zeros_(self.conv3.bn.weight) - - def forward(self, x): - residual = x - x = self.conv1(x) - x = self.conv2(x) - x = self.conv3(x) - if self.se is not None: - x = self.se(x) - if self.drop_path is not None: - x = self.drop_path(x) - if self.downsample is not None: - residual = self.downsample(residual) - x += residual - x = self.act(x) - return x - - -def _create_skresnet(variant, pretrained=False, **kwargs): - return build_model_with_cfg( - ResNet, variant, pretrained, - default_cfg=default_cfgs[variant], - **kwargs) - - -@register_model -def skresnet18(pretrained=False, **kwargs): - """Constructs a Selective Kernel ResNet-18 model. - - Different from configs in Select Kernel paper or "Compounding the Performance Improvements..." this - variation splits the input channels to the selective convolutions to keep param count down. - """ - sk_kwargs = dict( - min_attn_channels=16, - attn_reduction=8, - split_input=True) - model_args = dict( - block=SelectiveKernelBasic, layers=[2, 2, 2, 2], block_args=dict(sk_kwargs=sk_kwargs), - zero_init_last_bn=False, **kwargs) - return _create_skresnet('skresnet18', pretrained, **model_args) - - -@register_model -def skresnet34(pretrained=False, **kwargs): - """Constructs a Selective Kernel ResNet-34 model. - - Different from configs in Select Kernel paper or "Compounding the Performance Improvements..." this - variation splits the input channels to the selective convolutions to keep param count down. - """ - sk_kwargs = dict( - min_attn_channels=16, - attn_reduction=8, - split_input=True) - model_args = dict( - block=SelectiveKernelBasic, layers=[3, 4, 6, 3], block_args=dict(sk_kwargs=sk_kwargs), - zero_init_last_bn=False, **kwargs) - return _create_skresnet('skresnet34', pretrained, **model_args) - - -@register_model -def skresnet50(pretrained=False, **kwargs): - """Constructs a Select Kernel ResNet-50 model. - - Different from configs in Select Kernel paper or "Compounding the Performance Improvements..." this - variation splits the input channels to the selective convolutions to keep param count down. - """ - sk_kwargs = dict(split_input=True) - model_args = dict( - block=SelectiveKernelBottleneck, layers=[3, 4, 6, 3], block_args=dict(sk_kwargs=sk_kwargs), - zero_init_last_bn=False, **kwargs) - return _create_skresnet('skresnet50', pretrained, **model_args) - - -@register_model -def skresnet50d(pretrained=False, **kwargs): - """Constructs a Select Kernel ResNet-50-D model. - - Different from configs in Select Kernel paper or "Compounding the Performance Improvements..." this - variation splits the input channels to the selective convolutions to keep param count down. 
- """ - sk_kwargs = dict(split_input=True) - model_args = dict( - block=SelectiveKernelBottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', avg_down=True, - block_args=dict(sk_kwargs=sk_kwargs), zero_init_last_bn=False, **kwargs) - return _create_skresnet('skresnet50d', pretrained, **model_args) - - -@register_model -def skresnext50_32x4d(pretrained=False, **kwargs): - """Constructs a Select Kernel ResNeXt50-32x4d model. This should be equivalent to - the SKNet-50 model in the Select Kernel Paper - """ - model_args = dict( - block=SelectiveKernelBottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, - zero_init_last_bn=False, **kwargs) - return _create_skresnet('skresnext50_32x4d', pretrained, **model_args) - diff --git a/AVLFormer/src/timm/models/swin_transformer.py b/AVLFormer/src/timm/models/swin_transformer.py deleted file mode 100644 index 14b4d7a..0000000 --- a/AVLFormer/src/timm/models/swin_transformer.py +++ /dev/null @@ -1,651 +0,0 @@ -""" Swin Transformer -A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - - https://arxiv.org/pdf/2103.14030 - -Code/weights from https://github.com/microsoft/Swin-Transformer, original copyright/license info below - -""" -# -------------------------------------------------------- -# Swin Transformer -# Copyright (c) 2021 Microsoft -# Licensed under The MIT License [see LICENSE for details] -# Written by Ze Liu -# -------------------------------------------------------- -import logging -import math -from copy import deepcopy -from typing import Optional - -import torch -import torch.nn as nn -import torch.utils.checkpoint as checkpoint - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .helpers import build_model_with_cfg, overlay_external_default_cfg -from .layers import DropPath, to_2tuple, trunc_normal_ -from .registry import register_model -from .vision_transformer import checkpoint_filter_fn, Mlp, PatchEmbed, _init_vit_weights - -_logger = logging.getLogger(__name__) - - -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, - 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True, - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'patch_embed.proj', 'classifier': 'head', - **kwargs - } - - -default_cfgs = { - # patch models (my experiments) - 'swin_base_patch4_window12_384': _cfg( - url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22kto1k.pth', - input_size=(3, 384, 384), crop_pct=1.0), - - 'swin_base_patch4_window7_224': _cfg( - url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224_22kto1k.pth', - ), - - 'swin_large_patch4_window12_384': _cfg( - url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22kto1k.pth', - input_size=(3, 384, 384), crop_pct=1.0), - - 'swin_large_patch4_window7_224': _cfg( - url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window7_224_22kto1k.pth', - ), - - 'swin_small_patch4_window7_224': _cfg( - url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth', - ), - - 'swin_tiny_patch4_window7_224': _cfg( - url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth', - ), - - 'swin_base_patch4_window12_384_in22k': _cfg( - 
url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth', - input_size=(3, 384, 384), crop_pct=1.0, num_classes=21841), - - 'swin_base_patch4_window7_224_in22k': _cfg( - url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224_22k.pth', - num_classes=21841), - - 'swin_large_patch4_window12_384_in22k': _cfg( - url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth', - input_size=(3, 384, 384), crop_pct=1.0, num_classes=21841), - - 'swin_large_patch4_window7_224_in22k': _cfg( - url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window7_224_22k.pth', - num_classes=21841), - -} - - -def window_partition(x, window_size: int): - """ - Args: - x: (B, H, W, C) - window_size (int): window size - - Returns: - windows: (num_windows*B, window_size, window_size, C) - """ - B, H, W, C = x.shape - x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) - windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) - return windows - - -def window_reverse(windows, window_size: int, H: int, W: int): - """ - Args: - windows: (num_windows*B, window_size, window_size, C) - window_size (int): Window size - H (int): Height of image - W (int): Width of image - - Returns: - x: (B, H, W, C) - """ - B = int(windows.shape[0] / (H * W / window_size / window_size)) - x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) - x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) - return x - - -class WindowAttention(nn.Module): - r""" Window based multi-head self attention (W-MSA) module with relative position bias. - It supports both shifted and non-shifted windows. - - Args: - dim (int): Number of input channels. - window_size (tuple[int]): The height and width of the window. - num_heads (int): Number of attention heads. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set - attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 - proj_drop (float, optional): Dropout ratio of output.
Default: 0.0 - """ - - def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.): - - super().__init__() - self.dim = dim - self.window_size = window_size # Wh, Ww - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim ** -0.5 - - # define a parameter table of relative position bias - self.relative_position_bias_table = nn.Parameter( - torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH - - # get pair-wise relative position index for each token inside the window - coords_h = torch.arange(self.window_size[0]) - coords_w = torch.arange(self.window_size[1]) - coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww - relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += self.window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 - relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww - self.register_buffer("relative_position_index", relative_position_index) - - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - trunc_normal_(self.relative_position_bias_table, std=.02) - self.softmax = nn.Softmax(dim=-1) - - def forward(self, x, mask: Optional[torch.Tensor] = None): - """ - Args: - x: input features with shape of (num_windows*B, N, C) - mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None - """ - B_, N, C = x.shape - qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) - q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) - - q = q * self.scale - attn = (q @ k.transpose(-2, -1)) - - relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( - self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww - attn = attn + relative_position_bias.unsqueeze(0) - - if mask is not None: - nW = mask.shape[0] - attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) - attn = attn.view(-1, self.num_heads, N, N) - attn = self.softmax(attn) - else: - attn = self.softmax(attn) - - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B_, N, C) - x = self.proj(x) - x = self.proj_drop(x) - return x - - -class SwinTransformerBlock(nn.Module): - r""" Swin Transformer Block. - - Args: - dim (int): Number of input channels. - input_resolution (tuple[int]): Input resulotion. - num_heads (int): Number of attention heads. - window_size (int): Window size. - shift_size (int): Shift size for SW-MSA. - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. - drop (float, optional): Dropout rate. Default: 0.0 - attn_drop (float, optional): Attention dropout rate. Default: 0.0 - drop_path (float, optional): Stochastic depth rate. 
Default: 0.0 - act_layer (nn.Module, optional): Activation layer. Default: nn.GELU - norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm - """ - - def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0, - mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., - act_layer=nn.GELU, norm_layer=nn.LayerNorm): - super().__init__() - self.dim = dim - self.input_resolution = input_resolution - self.num_heads = num_heads - self.window_size = window_size - self.shift_size = shift_size - self.mlp_ratio = mlp_ratio - if min(self.input_resolution) <= self.window_size: - # if window size is larger than input resolution, we don't partition windows - self.shift_size = 0 - self.window_size = min(self.input_resolution) - assert 0 <= self.shift_size < self.window_size, "shift_size must be in [0, window_size)" - - self.norm1 = norm_layer(dim) - self.attn = WindowAttention( - dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, - qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) - - self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) - - if self.shift_size > 0: - # calculate attention mask for SW-MSA - H, W = self.input_resolution - img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 - h_slices = (slice(0, -self.window_size), - slice(-self.window_size, -self.shift_size), - slice(-self.shift_size, None)) - w_slices = (slice(0, -self.window_size), - slice(-self.window_size, -self.shift_size), - slice(-self.shift_size, None)) - cnt = 0 - for h in h_slices: - for w in w_slices: - img_mask[:, h, w, :] = cnt - cnt += 1 - - mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 - mask_windows = mask_windows.view(-1, self.window_size * self.window_size) - attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) - attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) - else: - attn_mask = None - - self.register_buffer("attn_mask", attn_mask) - - def forward(self, x): - H, W = self.input_resolution - B, L, C = x.shape - assert L == H * W, "input feature has wrong size" - - shortcut = x - x = self.norm1(x) - x = x.view(B, H, W, C) - - # cyclic shift - if self.shift_size > 0: - shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) - else: - shifted_x = x - - # partition windows - x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C - x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C - - # W-MSA/SW-MSA - attn_windows = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C - - # merge windows - attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) - shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C - - # reverse cyclic shift - if self.shift_size > 0: - x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) - else: - x = shifted_x - x = x.view(B, H * W, C) - - # FFN - x = shortcut + self.drop_path(x) - x = x + self.drop_path(self.mlp(self.norm2(x))) - - return x - - -class PatchMerging(nn.Module): - r""" Patch Merging Layer. - - Args: - input_resolution (tuple[int]): Resolution of input feature.
- dim (int): Number of input channels. - norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm - """ - - def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): - super().__init__() - self.input_resolution = input_resolution - self.dim = dim - self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) - self.norm = norm_layer(4 * dim) - - def forward(self, x): - """ - x: B, H*W, C - """ - H, W = self.input_resolution - B, L, C = x.shape - assert L == H * W, "input feature has wrong size" - assert H % 2 == 0 and W % 2 == 0, f"H and W must be even, got {H}*{W}." - - x = x.view(B, H, W, C) - - x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C - x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C - x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C - x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C - x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C - x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C - - x = self.norm(x) - x = self.reduction(x) - - return x - - def extra_repr(self) -> str: - return f"input_resolution={self.input_resolution}, dim={self.dim}" - - def flops(self): - H, W = self.input_resolution - flops = H * W * self.dim - flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim - return flops - - -class BasicLayer(nn.Module): - """ A basic Swin Transformer layer for one stage. - - Args: - dim (int): Number of input channels. - input_resolution (tuple[int]): Input resolution. - depth (int): Number of blocks. - num_heads (int): Number of attention heads. - window_size (int): Local window size. - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. - drop (float, optional): Dropout rate. Default: 0.0 - attn_drop (float, optional): Attention dropout rate. Default: 0.0 - drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 - norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm - downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None - use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
- """ - - def __init__(self, dim, input_resolution, depth, num_heads, window_size, - mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., - drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False): - - super().__init__() - self.dim = dim - self.input_resolution = input_resolution - self.depth = depth - self.use_checkpoint = use_checkpoint - - # build blocks - self.blocks = nn.ModuleList([ - SwinTransformerBlock(dim=dim, input_resolution=input_resolution, - num_heads=num_heads, window_size=window_size, - shift_size=0 if (i % 2 == 0) else window_size // 2, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop, attn_drop=attn_drop, - drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, - norm_layer=norm_layer) - for i in range(depth)]) - - # patch merging layer - if downsample is not None: - self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer) - else: - self.downsample = None - - def forward(self, x): - for blk in self.blocks: - if not torch.jit.is_scripting() and self.use_checkpoint: - x = checkpoint.checkpoint(blk, x) - else: - x = blk(x) - if self.downsample is not None: - x = self.downsample(x) - return x - - def extra_repr(self) -> str: - return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}" - - -class SwinTransformer(nn.Module): - r""" Swin Transformer - A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - - https://arxiv.org/pdf/2103.14030 - - Args: - img_size (int | tuple(int)): Input image size. Default 224 - patch_size (int | tuple(int)): Patch size. Default: 4 - in_chans (int): Number of input image channels. Default: 3 - num_classes (int): Number of classes for classification head. Default: 1000 - embed_dim (int): Patch embedding dimension. Default: 96 - depths (tuple(int)): Depth of each Swin Transformer layer. - num_heads (tuple(int)): Number of attention heads in different layers. - window_size (int): Window size. Default: 7 - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 - qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None - drop_rate (float): Dropout rate. Default: 0 - attn_drop_rate (float): Attention dropout rate. Default: 0 - drop_path_rate (float): Stochastic depth rate. Default: 0.1 - norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. - ape (bool): If True, add absolute position embedding to the patch embedding. Default: False - patch_norm (bool): If True, add normalization after patch embedding. Default: True - use_checkpoint (bool): Whether to use checkpointing to save memory. 
Default: False - """ - - def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, - embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24), - window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, - drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, - norm_layer=nn.LayerNorm, ape=False, patch_norm=True, - use_checkpoint=False, weight_init='', **kwargs): - super().__init__() - - self.num_classes = num_classes - self.num_layers = len(depths) - self.embed_dim = embed_dim - self.ape = ape - self.patch_norm = patch_norm - self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) - self.mlp_ratio = mlp_ratio - - # split image into non-overlapping patches - self.patch_embed = PatchEmbed( - img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, - norm_layer=norm_layer if self.patch_norm else None) - num_patches = self.patch_embed.num_patches - self.patch_grid = self.patch_embed.patch_grid - - # absolute position embedding - if self.ape: - self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) - trunc_normal_(self.absolute_pos_embed, std=.02) - else: - self.absolute_pos_embed = None - - self.pos_drop = nn.Dropout(p=drop_rate) - - # stochastic depth - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule - - # build layers - layers = [] - for i_layer in range(self.num_layers): - layers += [BasicLayer( - dim=int(embed_dim * 2 ** i_layer), - input_resolution=(self.patch_grid[0] // (2 ** i_layer), self.patch_grid[1] // (2 ** i_layer)), - depth=depths[i_layer], - num_heads=num_heads[i_layer], - window_size=window_size, - mlp_ratio=self.mlp_ratio, - qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop_rate, attn_drop=attn_drop_rate, - drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], - norm_layer=norm_layer, - downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, - use_checkpoint=use_checkpoint) - ] - self.layers = nn.Sequential(*layers) - - self.norm = norm_layer(self.num_features) - self.avgpool = nn.AdaptiveAvgPool1d(1) - self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() - - assert weight_init in ('jax', 'jax_nlhb', 'nlhb', '') - head_bias = -math.log(self.num_classes) if 'nlhb' in weight_init else 0. 
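# [Editor's example -- not part of the deleted file. The "stochastic depth decay
# rule" above spreads drop_path_rate linearly over all blocks and slices the
# schedule per stage, so deeper stages are regularized harder:]
import torch
depths, drop_path_rate = (2, 2, 6, 2), 0.1
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
per_stage = [dpr[sum(depths[:i]):sum(depths[:i + 1])] for i in range(len(depths))]
# stage depths 2/2/6/2 get rates ~[0, .009], [.018, .027], [.036 .. .082], [.091, .1]
# and num_features = embed_dim * 2 ** (num_layers - 1) = 96 * 8 = 768 for these defaults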
- if weight_init.startswith('jax'): - for n, m in self.named_modules(): - _init_vit_weights(m, n, head_bias=head_bias, jax_impl=True) - else: - self.apply(_init_vit_weights) - - @torch.jit.ignore - def no_weight_decay(self): - return {'absolute_pos_embed'} - - @torch.jit.ignore - def no_weight_decay_keywords(self): - return {'relative_position_bias_table'} - - def forward_features(self, x): - x = self.patch_embed(x) - if self.absolute_pos_embed is not None: - x = x + self.absolute_pos_embed - x = self.pos_drop(x) - x = self.layers(x) - x = self.norm(x) # B L C - x = self.avgpool(x.transpose(1, 2)) # B C 1 - x = torch.flatten(x, 1) - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.head(x) - return x - - -def _create_swin_transformer(variant, pretrained=False, default_cfg=None, **kwargs): - if default_cfg is None: - default_cfg = deepcopy(default_cfgs[variant]) - overlay_external_default_cfg(default_cfg, kwargs) - default_num_classes = default_cfg['num_classes'] - default_img_size = default_cfg['input_size'][-2:] - - num_classes = kwargs.pop('num_classes', default_num_classes) - img_size = kwargs.pop('img_size', default_img_size) - if kwargs.get('features_only', None): - raise RuntimeError('features_only not implemented for Vision Transformer models.') - - model = build_model_with_cfg( - SwinTransformer, variant, pretrained, - default_cfg=default_cfg, - img_size=img_size, - num_classes=num_classes, - pretrained_filter_fn=checkpoint_filter_fn, - **kwargs) - - return model - - - -@register_model -def swin_base_patch4_window12_384(pretrained=False, **kwargs): - """ Swin-B @ 384x384, pretrained ImageNet-22k, fine tune 1k - """ - model_kwargs = dict( - patch_size=4, window_size=12, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs) - return _create_swin_transformer('swin_base_patch4_window12_384', pretrained=pretrained, **model_kwargs) - - -@register_model -def swin_base_patch4_window7_224(pretrained=False, **kwargs): - """ Swin-B @ 224x224, pretrained ImageNet-22k, fine tune 1k - """ - model_kwargs = dict( - patch_size=4, window_size=7, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs) - return _create_swin_transformer('swin_base_patch4_window7_224', pretrained=pretrained, **model_kwargs) - - -@register_model -def swin_large_patch4_window12_384(pretrained=False, **kwargs): - """ Swin-L @ 384x384, pretrained ImageNet-22k, fine tune 1k - """ - model_kwargs = dict( - patch_size=4, window_size=12, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48), **kwargs) - return _create_swin_transformer('swin_large_patch4_window12_384', pretrained=pretrained, **model_kwargs) - - -@register_model -def swin_large_patch4_window7_224(pretrained=False, **kwargs): - """ Swin-L @ 224x224, pretrained ImageNet-22k, fine tune 1k - """ - model_kwargs = dict( - patch_size=4, window_size=7, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48), **kwargs) - return _create_swin_transformer('swin_large_patch4_window7_224', pretrained=pretrained, **model_kwargs) - - -@register_model -def swin_small_patch4_window7_224(pretrained=False, **kwargs): - """ Swin-S @ 224x224, trained ImageNet-1k - """ - model_kwargs = dict( - patch_size=4, window_size=7, embed_dim=96, depths=(2, 2, 18, 2), num_heads=(3, 6, 12, 24), **kwargs) - return _create_swin_transformer('swin_small_patch4_window7_224', pretrained=pretrained, **model_kwargs) - - -@register_model -def swin_tiny_patch4_window7_224(pretrained=False, **kwargs): - """ Swin-T @ 224x224, 
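# [Editor's example -- not part of the deleted file. forward_features above ends
# with LayerNorm plus average pooling over the token axis, so the classifier head
# receives a single (B, num_features) vector:]
import torch
import torch.nn as nn
tokens = torch.randn(2, 49, 768)                           # (B, L, C) after the last stage
pooled = nn.AdaptiveAvgPool1d(1)(tokens.transpose(1, 2))   # (B, C, 1)
feats = torch.flatten(pooled, 1)                           # (B, 768) -> nn.Linear head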
trained ImageNet-1k - """ - model_kwargs = dict( - patch_size=4, window_size=7, embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24), **kwargs) - return _create_swin_transformer('swin_tiny_patch4_window7_224', pretrained=pretrained, **model_kwargs) - - -@register_model -def swin_base_patch4_window12_384_in22k(pretrained=False, **kwargs): - """ Swin-B @ 384x384, trained ImageNet-22k - """ - model_kwargs = dict( - patch_size=4, window_size=12, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs) - return _create_swin_transformer('swin_base_patch4_window12_384_in22k', pretrained=pretrained, **model_kwargs) - - -@register_model -def swin_base_patch4_window7_224_in22k(pretrained=False, **kwargs): - """ Swin-B @ 224x224, trained ImageNet-22k - """ - model_kwargs = dict( - patch_size=4, window_size=7, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs) - return _create_swin_transformer('swin_base_patch4_window7_224_in22k', pretrained=pretrained, **model_kwargs) - - -@register_model -def swin_large_patch4_window12_384_in22k(pretrained=False, **kwargs): - """ Swin-L @ 384x384, trained ImageNet-22k - """ - model_kwargs = dict( - patch_size=4, window_size=12, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48), **kwargs) - return _create_swin_transformer('swin_large_patch4_window12_384_in22k', pretrained=pretrained, **model_kwargs) - - -@register_model -def swin_large_patch4_window7_224_in22k(pretrained=False, **kwargs): - """ Swin-L @ 224x224, trained ImageNet-22k - """ - model_kwargs = dict( - patch_size=4, window_size=7, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48), **kwargs) - return _create_swin_transformer('swin_large_patch4_window7_224_in22k', pretrained=pretrained, **model_kwargs) \ No newline at end of file diff --git a/AVLFormer/src/timm/models/tnt.py b/AVLFormer/src/timm/models/tnt.py deleted file mode 100644 index aaa1289..0000000 --- a/AVLFormer/src/timm/models/tnt.py +++ /dev/null @@ -1,247 +0,0 @@ -""" Transformer in Transformer (TNT) in PyTorch - -A PyTorch implement of TNT as described in -'Transformer in Transformer' - https://arxiv.org/abs/2103.00112 - -The official mindspore code is released and available at -https://gitee.com/mindspore/mindspore/tree/master/model_zoo/research/cv/TNT -""" -import math -import torch -import torch.nn as nn -from functools import partial - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from src.timm.models.helpers import load_pretrained -from src.timm.models.layers import DropPath, trunc_normal_ -from src.timm.models.vision_transformer import Mlp -from src.timm.models.registry import register_model - - -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, - 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True, - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'pixel_embed.proj', 'classifier': 'head', - **kwargs - } - - -default_cfgs = { - 'tnt_s_patch16_224': _cfg( - url='https://github.com/contrastive/pytorch-image-models/releases/download/TNT/tnt_s_patch16_224.pth.tar', - mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), - ), - 'tnt_b_patch16_224': _cfg( - mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), - ), -} - - -class Attention(nn.Module): - """ Multi-Head Attention - """ - def __init__(self, dim, hidden_dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): - super().__init__() - self.hidden_dim = hidden_dim - self.num_heads = 
num_heads - head_dim = hidden_dim // num_heads - self.head_dim = head_dim - self.scale = head_dim ** -0.5 - - self.qk = nn.Linear(dim, hidden_dim * 2, bias=qkv_bias) - self.v = nn.Linear(dim, dim, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop, inplace=True) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop, inplace=True) - - def forward(self, x): - B, N, C = x.shape - qk = self.qk(x).reshape(B, N, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) - q, k = qk[0], qk[1] # make torchscript happy (cannot use tensor as tuple) - v = self.v(x).reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) - - attn = (q @ k.transpose(-2, -1)) * self.scale - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B, N, -1) - x = self.proj(x) - x = self.proj_drop(x) - return x - - -class Block(nn.Module): - """ TNT Block - """ - def __init__(self, dim, in_dim, num_pixel, num_heads=12, in_num_head=4, mlp_ratio=4., - qkv_bias=False, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): - super().__init__() - # Inner transformer - self.norm_in = norm_layer(in_dim) - self.attn_in = Attention( - in_dim, in_dim, num_heads=in_num_head, qkv_bias=qkv_bias, - attn_drop=attn_drop, proj_drop=drop) - - self.norm_mlp_in = norm_layer(in_dim) - self.mlp_in = Mlp(in_features=in_dim, hidden_features=int(in_dim * 4), - out_features=in_dim, act_layer=act_layer, drop=drop) - - self.norm1_proj = norm_layer(in_dim) - self.proj = nn.Linear(in_dim * num_pixel, dim, bias=True) - # Outer transformer - self.norm_out = norm_layer(dim) - self.attn_out = Attention( - dim, dim, num_heads=num_heads, qkv_bias=qkv_bias, - attn_drop=attn_drop, proj_drop=drop) - self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - - self.norm_mlp = norm_layer(dim) - self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), - out_features=dim, act_layer=act_layer, drop=drop) - - def forward(self, pixel_embed, patch_embed): - # inner - pixel_embed = pixel_embed + self.drop_path(self.attn_in(self.norm_in(pixel_embed))) - pixel_embed = pixel_embed + self.drop_path(self.mlp_in(self.norm_mlp_in(pixel_embed))) - # outer - B, N, C = patch_embed.size() - patch_embed[:, 1:] = patch_embed[:, 1:] + self.proj(self.norm1_proj(pixel_embed).reshape(B, N - 1, -1)) - patch_embed = patch_embed + self.drop_path(self.attn_out(self.norm_out(patch_embed))) - patch_embed = patch_embed + self.drop_path(self.mlp(self.norm_mlp(patch_embed))) - return pixel_embed, patch_embed - - -class PixelEmbed(nn.Module): - """ Image to Pixel Embedding - """ - def __init__(self, img_size=224, patch_size=16, in_chans=3, in_dim=48, stride=4): - super().__init__() - num_patches = (img_size // patch_size) ** 2 - self.img_size = img_size - self.num_patches = num_patches - self.in_dim = in_dim - new_patch_size = math.ceil(patch_size / stride) - self.new_patch_size = new_patch_size - - self.proj = nn.Conv2d(in_chans, self.in_dim, kernel_size=7, padding=3, stride=stride) - self.unfold = nn.Unfold(kernel_size=new_patch_size, stride=new_patch_size) - - def forward(self, x, pixel_pos): - B, C, H, W = x.shape - assert H == self.img_size and W == self.img_size, \ - f"Input image size ({H}*{W}) doesn't match model ({self.img_size}*{self.img_size})." 
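# [Editor's example -- not part of the deleted file. PixelEmbed above turns each
# 16x16 image patch into a small grid of "pixel" tokens: a stride-4 conv yields a
# 56x56 feature map, then Unfold with kernel = stride = new_patch_size regroups it
# into one cell per patch:]
import math
import torch
import torch.nn as nn

img_size, patch_size, stride, in_dim = 224, 16, 4, 48
num_patches = (img_size // patch_size) ** 2              # 196
new_patch_size = math.ceil(patch_size / stride)          # 4
proj = nn.Conv2d(3, in_dim, kernel_size=7, padding=3, stride=stride)
unfold = nn.Unfold(kernel_size=new_patch_size, stride=new_patch_size)
x = torch.randn(2, 3, img_size, img_size)
y = unfold(proj(x))                                      # (2, 48*4*4, 196)
y = y.transpose(1, 2).reshape(2 * num_patches, in_dim, new_patch_size, new_patch_size)
print(y.shape)                                           # torch.Size([392, 48, 4, 4])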
- x = self.proj(x) - x = self.unfold(x) - x = x.transpose(1, 2).reshape(B * self.num_patches, self.in_dim, self.new_patch_size, self.new_patch_size) - x = x + pixel_pos - x = x.reshape(B * self.num_patches, self.in_dim, -1).transpose(1, 2) - return x - - -class TNT(nn.Module): - """ Transformer in Transformer - https://arxiv.org/abs/2103.00112 - """ - def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, in_dim=48, depth=12, - num_heads=12, in_num_head=4, mlp_ratio=4., qkv_bias=False, drop_rate=0., attn_drop_rate=0., - drop_path_rate=0., norm_layer=nn.LayerNorm, first_stride=4): - super().__init__() - self.num_classes = num_classes - self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models - - self.pixel_embed = PixelEmbed( - img_size=img_size, patch_size=patch_size, in_chans=in_chans, in_dim=in_dim, stride=first_stride) - num_patches = self.pixel_embed.num_patches - self.num_patches = num_patches - new_patch_size = self.pixel_embed.new_patch_size - num_pixel = new_patch_size ** 2 - - self.norm1_proj = norm_layer(num_pixel * in_dim) - self.proj = nn.Linear(num_pixel * in_dim, embed_dim) - self.norm2_proj = norm_layer(embed_dim) - - self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) - self.patch_pos = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) - self.pixel_pos = nn.Parameter(torch.zeros(1, in_dim, new_patch_size, new_patch_size)) - self.pos_drop = nn.Dropout(p=drop_rate) - - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule - blocks = [] - for i in range(depth): - blocks.append(Block( - dim=embed_dim, in_dim=in_dim, num_pixel=num_pixel, num_heads=num_heads, in_num_head=in_num_head, - mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop=drop_rate, attn_drop=attn_drop_rate, - drop_path=dpr[i], norm_layer=norm_layer)) - self.blocks = nn.ModuleList(blocks) - self.norm = norm_layer(embed_dim) - - self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() - - trunc_normal_(self.cls_token, std=.02) - trunc_normal_(self.patch_pos, std=.02) - trunc_normal_(self.pixel_pos, std=.02) - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) - - @torch.jit.ignore - def no_weight_decay(self): - return {'patch_pos', 'pixel_pos', 'cls_token'} - - def get_classifier(self): - return self.head - - def reset_classifier(self, num_classes, global_pool=''): - self.num_classes = num_classes - self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() - - def forward_features(self, x): - B = x.shape[0] - pixel_embed = self.pixel_embed(x, self.pixel_pos) - - patch_embed = self.norm2_proj(self.proj(self.norm1_proj(pixel_embed.reshape(B, self.num_patches, -1)))) - patch_embed = torch.cat((self.cls_token.expand(B, -1, -1), patch_embed), dim=1) - patch_embed = patch_embed + self.patch_pos - patch_embed = self.pos_drop(patch_embed) - - for blk in self.blocks: - pixel_embed, patch_embed = blk(pixel_embed, patch_embed) - - patch_embed = self.norm(patch_embed) - return patch_embed[:, 0] - - def forward(self, x): - x = self.forward_features(x) - x = self.head(x) - return x - - -@register_model -def tnt_s_patch16_224(pretrained=False, **kwargs): - model = TNT(patch_size=16, 
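# [Editor's example -- not part of the deleted file. The inner->outer fusion in
# Block.forward above: pixel tokens are flattened per patch, projected to the outer
# width, and added onto the patch tokens (skipping the class token at index 0).
# Sizes below are the tnt_s defaults:]
import torch
import torch.nn as nn
B, N, num_pixel, in_dim, dim = 2, 197, 16, 24, 384
pixel_embed = torch.randn(B * (N - 1), num_pixel, in_dim)
patch_embed = torch.randn(B, N, dim)
proj = nn.Linear(in_dim * num_pixel, dim)                # mirrors self.proj above
with torch.no_grad():
    patch_embed[:, 1:] = patch_embed[:, 1:] + proj(pixel_embed.reshape(B, N - 1, -1))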
embed_dim=384, in_dim=24, depth=12, num_heads=6, in_num_head=4, - qkv_bias=False, **kwargs) - model.default_cfg = default_cfgs['tnt_s_patch16_224'] - if pretrained: - load_pretrained( - model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3)) - return model - - -@register_model -def tnt_b_patch16_224(pretrained=False, **kwargs): - model = TNT(patch_size=16, embed_dim=640, in_dim=40, depth=12, num_heads=10, in_num_head=4, - qkv_bias=False, **kwargs) - model.default_cfg = default_cfgs['tnt_b_patch16_224'] - if pretrained: - load_pretrained( - model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3)) - return model diff --git a/AVLFormer/src/timm/models/tresnet.py b/AVLFormer/src/timm/models/tresnet.py deleted file mode 100644 index a8c237e..0000000 --- a/AVLFormer/src/timm/models/tresnet.py +++ /dev/null @@ -1,295 +0,0 @@ -""" -TResNet: High Performance GPU-Dedicated Architecture -https://arxiv.org/pdf/2003.13630.pdf - -Original model: https://github.com/mrT23/TResNet - -""" -import copy -from collections import OrderedDict -from functools import partial - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from .helpers import build_model_with_cfg -from .layers import SpaceToDepthModule, AntiAliasDownsampleLayer, InplaceAbn, ClassifierHead, SEModule -from .registry import register_model - -__all__ = ['tresnet_m', 'tresnet_l', 'tresnet_xl'] - - -def _cfg(url='', **kwargs): - return { - 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), - 'crop_pct': 0.875, 'interpolation': 'bilinear', - 'mean': (0, 0, 0), 'std': (1, 1, 1), - 'first_conv': 'body.conv1.0', 'classifier': 'head.fc', - **kwargs - } - - -default_cfgs = { - 'tresnet_m': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_m_80_8-dbc13962.pth'), - 'tresnet_l': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_l_81_5-235b486c.pth'), - 'tresnet_xl': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_xl_82_0-a2d51b00.pth'), - 'tresnet_m_448': _cfg( - input_size=(3, 448, 448), pool_size=(14, 14), - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_m_448-bc359d10.pth'), - 'tresnet_l_448': _cfg( - input_size=(3, 448, 448), pool_size=(14, 14), - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_l_448-940d0cd1.pth'), - 'tresnet_xl_448': _cfg( - input_size=(3, 448, 448), pool_size=(14, 14), - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_xl_448-8c1815de.pth') -} - - -def IABN2Float(module: nn.Module) -> nn.Module: - """If `module` is IABN don't use half precision.""" - if isinstance(module, InplaceAbn): - module.float() - for child in module.children(): - IABN2Float(child) - return module - - -def conv2d_iabn(ni, nf, stride, kernel_size=3, groups=1, act_layer="leaky_relu", act_param=1e-2): - return nn.Sequential( - nn.Conv2d( - ni, nf, kernel_size=kernel_size, stride=stride, padding=kernel_size // 2, groups=groups, bias=False), - InplaceAbn(nf, act_layer=act_layer, act_param=act_param) - ) - - -class BasicBlock(nn.Module): - expansion = 1 - - def __init__(self, inplanes, planes, stride=1, downsample=None, use_se=True, aa_layer=None): - super(BasicBlock, self).__init__() - if stride == 1: - self.conv1 = conv2d_iabn(inplanes, planes, stride=1, act_param=1e-3) - else: - if 
aa_layer is None: - self.conv1 = conv2d_iabn(inplanes, planes, stride=2, act_param=1e-3) - else: - self.conv1 = nn.Sequential( - conv2d_iabn(inplanes, planes, stride=1, act_param=1e-3), - aa_layer(channels=planes, filt_size=3, stride=2)) - - self.conv2 = conv2d_iabn(planes, planes, stride=1, act_layer="identity") - self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - self.stride = stride - reduction_chs = max(planes * self.expansion // 4, 64) - self.se = SEModule(planes * self.expansion, reduction_channels=reduction_chs) if use_se else None - - def forward(self, x): - if self.downsample is not None: - residual = self.downsample(x) - else: - residual = x - - out = self.conv1(x) - out = self.conv2(out) - - if self.se is not None: - out = self.se(out) - - out += residual - out = self.relu(out) - return out - - -class Bottleneck(nn.Module): - expansion = 4 - - def __init__(self, inplanes, planes, stride=1, downsample=None, use_se=True, - act_layer="leaky_relu", aa_layer=None): - super(Bottleneck, self).__init__() - self.conv1 = conv2d_iabn( - inplanes, planes, kernel_size=1, stride=1, act_layer=act_layer, act_param=1e-3) - if stride == 1: - self.conv2 = conv2d_iabn( - planes, planes, kernel_size=3, stride=1, act_layer=act_layer, act_param=1e-3) - else: - if aa_layer is None: - self.conv2 = conv2d_iabn( - planes, planes, kernel_size=3, stride=2, act_layer=act_layer, act_param=1e-3) - else: - self.conv2 = nn.Sequential( - conv2d_iabn(planes, planes, kernel_size=3, stride=1, act_layer=act_layer, act_param=1e-3), - aa_layer(channels=planes, filt_size=3, stride=2)) - - reduction_chs = max(planes * self.expansion // 8, 64) - self.se = SEModule(planes, reduction_channels=reduction_chs) if use_se else None - - self.conv3 = conv2d_iabn( - planes, planes * self.expansion, kernel_size=1, stride=1, act_layer="identity") - - self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - self.stride = stride - - def forward(self, x): - if self.downsample is not None: - residual = self.downsample(x) - else: - residual = x - - out = self.conv1(x) - out = self.conv2(out) - if self.se is not None: - out = self.se(out) - - out = self.conv3(out) - out = out + residual # no inplace - out = self.relu(out) - - return out - - -class TResNet(nn.Module): - def __init__(self, layers, in_chans=3, num_classes=1000, width_factor=1.0, no_aa_jit=False, - global_pool='fast', drop_rate=0.): - self.num_classes = num_classes - self.drop_rate = drop_rate - super(TResNet, self).__init__() - - # JIT layers - space_to_depth = SpaceToDepthModule() - aa_layer = partial(AntiAliasDownsampleLayer, no_jit=no_aa_jit) - - # TResnet stages - self.inplanes = int(64 * width_factor) - self.planes = int(64 * width_factor) - conv1 = conv2d_iabn(in_chans * 16, self.planes, stride=1, kernel_size=3) - layer1 = self._make_layer( - BasicBlock, self.planes, layers[0], stride=1, use_se=True, aa_layer=aa_layer) # 56x56 - layer2 = self._make_layer( - BasicBlock, self.planes * 2, layers[1], stride=2, use_se=True, aa_layer=aa_layer) # 28x28 - layer3 = self._make_layer( - Bottleneck, self.planes * 4, layers[2], stride=2, use_se=True, aa_layer=aa_layer) # 14x14 - layer4 = self._make_layer( - Bottleneck, self.planes * 8, layers[3], stride=2, use_se=False, aa_layer=aa_layer) # 7x7 - - # body - self.body = nn.Sequential(OrderedDict([ - ('SpaceToDepth', space_to_depth), - ('conv1', conv1), - ('layer1', layer1), - ('layer2', layer2), - ('layer3', layer3), - ('layer4', layer4)])) - - self.feature_info = [ - dict(num_chs=self.planes, reduction=2, 
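# [Editor's example -- not part of the deleted file. TResNet's stem relies on
# SpaceToDepth: each 4x4 pixel block moves into the channel dimension, which is why
# conv1 above takes in_chans * 16 inputs at 1/4 resolution. Equivalent rearrangement:]
import torch
x = torch.randn(2, 3, 224, 224)
B, C, H, W = x.shape
s = 4
y = x.view(B, C, H // s, s, W // s, s).permute(0, 3, 5, 1, 2, 4).reshape(B, C * s * s, H // s, W // s)
print(y.shape)  # torch.Size([2, 48, 56, 56]) -> feeds conv2d_iabn(3 * 16, planes, ...)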
module=''), # Not with S2D? - dict(num_chs=self.planes, reduction=4, module='body.layer1'), - dict(num_chs=self.planes * 2, reduction=8, module='body.layer2'), - dict(num_chs=self.planes * 4 * Bottleneck.expansion, reduction=16, module='body.layer3'), - dict(num_chs=self.planes * 8 * Bottleneck.expansion, reduction=32, module='body.layer4'), - ] - - # head - self.num_features = (self.planes * 8) * Bottleneck.expansion - self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=drop_rate) - - # model initilization - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='leaky_relu') - elif isinstance(m, nn.BatchNorm2d) or isinstance(m, InplaceAbn): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - # residual connections special initialization - for m in self.modules(): - if isinstance(m, BasicBlock): - m.conv2[1].weight = nn.Parameter(torch.zeros_like(m.conv2[1].weight)) # BN to zero - if isinstance(m, Bottleneck): - m.conv3[1].weight = nn.Parameter(torch.zeros_like(m.conv3[1].weight)) # BN to zero - if isinstance(m, nn.Linear): - m.weight.data.normal_(0, 0.01) - - def _make_layer(self, block, planes, blocks, stride=1, use_se=True, aa_layer=None): - downsample = None - if stride != 1 or self.inplanes != planes * block.expansion: - layers = [] - if stride == 2: - # avg pooling before 1x1 conv - layers.append(nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=True, count_include_pad=False)) - layers += [conv2d_iabn( - self.inplanes, planes * block.expansion, kernel_size=1, stride=1, act_layer="identity")] - downsample = nn.Sequential(*layers) - - layers = [] - layers.append(block( - self.inplanes, planes, stride, downsample, use_se=use_se, aa_layer=aa_layer)) - self.inplanes = planes * block.expansion - for i in range(1, blocks): - layers.append( - block(self.inplanes, planes, use_se=use_se, aa_layer=aa_layer)) - return nn.Sequential(*layers) - - def get_classifier(self): - return self.head.fc - - def reset_classifier(self, num_classes, global_pool='fast'): - self.head = ClassifierHead( - self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) - - def forward_features(self, x): - return self.body(x) - - def forward(self, x): - x = self.forward_features(x) - x = self.head(x) - return x - - -def _create_tresnet(variant, pretrained=False, **kwargs): - return build_model_with_cfg( - TResNet, variant, pretrained, - default_cfg=default_cfgs[variant], - feature_cfg=dict(out_indices=(1, 2, 3, 4), flatten_sequential=True), - **kwargs) - - -@register_model -def tresnet_m(pretrained=False, **kwargs): - model_kwargs = dict(layers=[3, 4, 11, 3], **kwargs) - return _create_tresnet('tresnet_m', pretrained=pretrained, **model_kwargs) - - -@register_model -def tresnet_l(pretrained=False, **kwargs): - model_kwargs = dict(layers=[4, 5, 18, 3], width_factor=1.2, **kwargs) - return _create_tresnet('tresnet_l', pretrained=pretrained, **model_kwargs) - - -@register_model -def tresnet_xl(pretrained=False, **kwargs): - model_kwargs = dict(layers=[4, 5, 24, 3], width_factor=1.3, **kwargs) - return _create_tresnet('tresnet_xl', pretrained=pretrained, **model_kwargs) - - -@register_model -def tresnet_m_448(pretrained=False, **kwargs): - model_kwargs = dict(layers=[3, 4, 11, 3], **kwargs) - return _create_tresnet('tresnet_m_448', pretrained=pretrained, **model_kwargs) - - -@register_model -def tresnet_l_448(pretrained=False, **kwargs): - model_kwargs = dict(layers=[4, 5, 18, 3], 
width_factor=1.2, **kwargs) - return _create_tresnet('tresnet_l_448', pretrained=pretrained, **model_kwargs) - - -@register_model -def tresnet_xl_448(pretrained=False, **kwargs): - model_kwargs = dict(layers=[4, 5, 24, 3], width_factor=1.3, **kwargs) - return _create_tresnet('tresnet_xl_448', pretrained=pretrained, **model_kwargs) diff --git a/AVLFormer/src/timm/models/vgg.py b/AVLFormer/src/timm/models/vgg.py deleted file mode 100644 index 2aefc3c..0000000 --- a/AVLFormer/src/timm/models/vgg.py +++ /dev/null @@ -1,261 +0,0 @@ -"""VGG - -Adapted from https://github.com/pytorch/vision 'vgg.py' (BSD-3-Clause) with a few changes for -timm functionality. - -Copyright 2021 Ross Wightman -""" -import torch -import torch.nn as nn -import torch.nn.functional as F -from typing import Union, List, Dict, Any, cast - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .helpers import build_model_with_cfg -from .layers import ClassifierHead, ConvBnAct -from .registry import register_model - -__all__ = [ - 'VGG', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', - 'vgg19_bn', 'vgg19', -] - - -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (1, 1), - 'crop_pct': 0.875, 'interpolation': 'bilinear', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'features.0', 'classifier': 'head.fc', - **kwargs - } - - -default_cfgs = { - 'vgg11': _cfg(url='https://download.pytorch.org/models/vgg11-bbd30ac9.pth'), - 'vgg13': _cfg(url='https://download.pytorch.org/models/vgg13-c768596a.pth'), - 'vgg16': _cfg(url='https://download.pytorch.org/models/vgg16-397923af.pth'), - 'vgg19': _cfg(url='https://download.pytorch.org/models/vgg19-dcbb9e9d.pth'), - 'vgg11_bn': _cfg(url='https://download.pytorch.org/models/vgg11_bn-6002323d.pth'), - 'vgg13_bn': _cfg(url='https://download.pytorch.org/models/vgg13_bn-abd245e5.pth'), - 'vgg16_bn': _cfg(url='https://download.pytorch.org/models/vgg16_bn-6c64b313.pth'), - 'vgg19_bn': _cfg(url='https://download.pytorch.org/models/vgg19_bn-c79401a0.pth'), -} - - -cfgs: Dict[str, List[Union[str, int]]] = { - 'vgg11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], - 'vgg13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], - 'vgg16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], - 'vgg19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'], -} - - -class ConvMlp(nn.Module): - - def __init__(self, in_features=512, out_features=4096, kernel_size=7, mlp_ratio=1.0, - drop_rate: float = 0.2, act_layer: nn.Module = None, conv_layer: nn.Module = None): - super(ConvMlp, self).__init__() - self.input_kernel_size = kernel_size - mid_features = int(out_features * mlp_ratio) - self.fc1 = conv_layer(in_features, mid_features, kernel_size, bias=True) - self.act1 = act_layer(True) - self.drop = nn.Dropout(drop_rate) - self.fc2 = conv_layer(mid_features, out_features, 1, bias=True) - self.act2 = act_layer(True) - - def forward(self, x): - if x.shape[-2] < self.input_kernel_size or x.shape[-1] < self.input_kernel_size: - # keep the input size >= 7x7 - output_size = (max(self.input_kernel_size, x.shape[-2]), max(self.input_kernel_size, x.shape[-1])) - x = F.adaptive_avg_pool2d(x, output_size) - x = self.fc1(x) - x = self.act1(x) - x = self.drop(x) - x = self.fc2(x) - x = self.act2(x) - return x - - -class VGG(nn.Module): - - def __init__( - 
self, - cfg: List[Any], - num_classes: int = 1000, - in_chans: int = 3, - output_stride: int = 32, - mlp_ratio: float = 1.0, - act_layer: nn.Module = nn.ReLU, - conv_layer: nn.Module = nn.Conv2d, - norm_layer: nn.Module = None, - global_pool: str = 'avg', - drop_rate: float = 0., - ) -> None: - super(VGG, self).__init__() - assert output_stride == 32 - self.num_classes = num_classes - self.num_features = 4096 - self.drop_rate = drop_rate - self.feature_info = [] - prev_chs = in_chans - net_stride = 1 - pool_layer = nn.MaxPool2d - layers: List[nn.Module] = [] - for v in cfg: - last_idx = len(layers) - 1 - if v == 'M': - self.feature_info.append(dict(num_chs=prev_chs, reduction=net_stride, module=f'features.{last_idx}')) - layers += [pool_layer(kernel_size=2, stride=2)] - net_stride *= 2 - else: - v = cast(int, v) - conv2d = conv_layer(prev_chs, v, kernel_size=3, padding=1) - if norm_layer is not None: - layers += [conv2d, norm_layer(v), act_layer(inplace=True)] - else: - layers += [conv2d, act_layer(inplace=True)] - prev_chs = v - self.features = nn.Sequential(*layers) - self.feature_info.append(dict(num_chs=prev_chs, reduction=net_stride, module=f'features.{len(layers) - 1}')) - self.pre_logits = ConvMlp( - prev_chs, self.num_features, 7, mlp_ratio=mlp_ratio, - drop_rate=drop_rate, act_layer=act_layer, conv_layer=conv_layer) - self.head = ClassifierHead( - self.num_features, num_classes, pool_type=global_pool, drop_rate=drop_rate) - - self._initialize_weights() - - def get_classifier(self): - return self.head.fc - - def reset_classifier(self, num_classes, global_pool='avg'): - self.num_classes = num_classes - self.head = ClassifierHead( - self.num_features, self.num_classes, pool_type=global_pool, drop_rate=self.drop_rate) - - def forward_features(self, x: torch.Tensor) -> torch.Tensor: - x = self.features(x) - x = self.pre_logits(x) - return x - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self.forward_features(x) - x = self.head(x) - return x - - def _initialize_weights(self) -> None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - if m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.Linear): - nn.init.normal_(m.weight, 0, 0.01) - nn.init.constant_(m.bias, 0) - - -def _filter_fn(state_dict): - """ convert patch embedding weight from manual patchify + linear proj to conv""" - out_dict = {} - for k, v in state_dict.items(): - k_r = k - k_r = k_r.replace('classifier.0', 'pre_logits.fc1') - k_r = k_r.replace('classifier.3', 'pre_logits.fc2') - k_r = k_r.replace('classifier.6', 'head.fc') - if 'classifier.0.weight' in k: - v = v.reshape(-1, 512, 7, 7) - if 'classifier.3.weight' in k: - v = v.reshape(-1, 4096, 1, 1) - out_dict[k_r] = v - return out_dict - - -def _create_vgg(variant: str, pretrained: bool, **kwargs: Any) -> VGG: - cfg = variant.split('_')[0] - # NOTE: VGG is one of the only models with stride==1 features, so indices are offset from other models - out_indices = kwargs.get('out_indices', (0, 1, 2, 3, 4, 5)) - model = build_model_with_cfg( - VGG, variant, pretrained, - default_cfg=default_cfgs[variant], - model_cfg=cfgs[cfg], - feature_cfg=dict(flatten_sequential=True, out_indices=out_indices), - pretrained_filter_fn=_filter_fn, - **kwargs) - return model - - -@register_model -def vgg11(pretrained: bool = False, **kwargs: Any) -> VGG: - r"""VGG 11-layer 
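# [Editor's example -- not part of the deleted file. _filter_fn above adapts
# torchvision's fully-connected VGG classifier to this fully-convolutional
# pre_logits head purely by renaming and reshaping the weights:]
import torch
fc1_w = torch.randn(4096, 512 * 7 * 7)     # torchvision 'classifier.0.weight'
conv1_w = fc1_w.reshape(-1, 512, 7, 7)     # -> 'pre_logits.fc1' Conv2d weight
fc2_w = torch.randn(4096, 4096)            # torchvision 'classifier.3.weight'
conv2_w = fc2_w.reshape(-1, 4096, 1, 1)    # -> 'pre_logits.fc2' 1x1 conv weight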
model (configuration "A") from - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_ - """ - model_args = dict(**kwargs) - return _create_vgg('vgg11', pretrained=pretrained, **model_args) - - -@register_model -def vgg11_bn(pretrained: bool = False, **kwargs: Any) -> VGG: - r"""VGG 11-layer model (configuration "A") with batch normalization - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_ - """ - model_args = dict(norm_layer=nn.BatchNorm2d, **kwargs) - return _create_vgg('vgg11_bn', pretrained=pretrained, **model_args) - - -@register_model -def vgg13(pretrained: bool = False, **kwargs: Any) -> VGG: - r"""VGG 13-layer model (configuration "B") - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_ - """ - model_args = dict(**kwargs) - return _create_vgg('vgg13', pretrained=pretrained, **model_args) - - -@register_model -def vgg13_bn(pretrained: bool = False, **kwargs: Any) -> VGG: - r"""VGG 13-layer model (configuration "B") with batch normalization - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_ - """ - model_args = dict(norm_layer=nn.BatchNorm2d, **kwargs) - return _create_vgg('vgg13_bn', pretrained=pretrained, **model_args) - - -@register_model -def vgg16(pretrained: bool = False, **kwargs: Any) -> VGG: - r"""VGG 16-layer model (configuration "D") - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_ - """ - model_args = dict(**kwargs) - return _create_vgg('vgg16', pretrained=pretrained, **model_args) - - -@register_model -def vgg16_bn(pretrained: bool = False, **kwargs: Any) -> VGG: - r"""VGG 16-layer model (configuration "D") with batch normalization - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_ - """ - model_args = dict(norm_layer=nn.BatchNorm2d, **kwargs) - return _create_vgg('vgg16_bn', pretrained=pretrained, **model_args) - - -@register_model -def vgg19(pretrained: bool = False, **kwargs: Any) -> VGG: - r"""VGG 19-layer model (configuration "E") - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_ - """ - model_args = dict(**kwargs) - return _create_vgg('vgg19', pretrained=pretrained, **model_args) - - -@register_model -def vgg19_bn(pretrained: bool = False, **kwargs: Any) -> VGG: - r"""VGG 19-layer model (configuration "E") with batch normalization - `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_ - """ - model_args = dict(norm_layer=nn.BatchNorm2d, **kwargs) - return _create_vgg('vgg19_bn', pretrained=pretrained, **model_args) \ No newline at end of file diff --git a/AVLFormer/src/timm/models/vision_transformer.py b/AVLFormer/src/timm/models/vision_transformer.py deleted file mode 100644 index 0f77097..0000000 --- a/AVLFormer/src/timm/models/vision_transformer.py +++ /dev/null @@ -1,697 +0,0 @@ -""" Vision Transformer (ViT) in PyTorch - -A PyTorch implementation of Vision Transformers as described in -'An Image Is Worth 16 x 16 Words: Transformers for Image Recognition at Scale' - https://arxiv.org/abs/2010.11929 - -The official jax code is released and available at https://github.com/google-research/vision_transformer - -DeiT model defs and weights from https://github.com/facebookresearch/deit, -paper `DeiT: Data-efficient Image Transformers` - https://arxiv.org/abs/2012.12877 - -Acknowledgments: -* The paper authors for releasing code and weights, thanks! -* I fixed my class token impl based on Phil Wang's https://github.com/lucidrains/vit-pytorch ...
check it out -for some einops/einsum fun -* Simple transformer style inspired by Andrej Karpathy's https://github.com/karpathy/minGPT -* Bert reference code checks against Huggingface Transformers and Tensorflow Bert - -Hacked together by / Copyright 2020 Ross Wightman -""" -import math -import logging -from functools import partial -from collections import OrderedDict -from copy import deepcopy - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .helpers import build_model_with_cfg, overlay_external_default_cfg -from .layers import DropPath, to_2tuple, trunc_normal_, lecun_normal_ -from .registry import register_model - -_logger = logging.getLogger(__name__) - - -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, - 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True, - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'patch_embed.proj', 'classifier': 'head', - **kwargs - } - - -default_cfgs = { - # patch models (my experiments) - 'vit_small_patch16_224': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/vit_small_p16_224-15ec54c9.pth', - ), - - # patch models (weights ported from official Google JAX impl) - 'vit_base_patch16_224': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth', - mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), - ), - 'vit_base_patch32_224': _cfg( - url='', # no official model weights for this combo, only for in21k - mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), - 'vit_base_patch16_384': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_384-83fb41ba.pth', - input_size=(3, 384, 384), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=1.0), - 'vit_base_patch32_384': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p32_384-830016f5.pth', - input_size=(3, 384, 384), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=1.0), - 'vit_large_patch16_224': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p16_224-4ee7a4dc.pth', - mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), - 'vit_large_patch32_224': _cfg( - url='', # no official model weights for this combo, only for in21k - mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), - 'vit_large_patch16_384': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p16_384-b3be5167.pth', - input_size=(3, 384, 384), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=1.0), - 'vit_large_patch32_384': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p32_384-9b920ba8.pth', - input_size=(3, 384, 384), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=1.0), - - # patch models, imagenet21k (weights ported from official Google JAX impl) - 'vit_base_patch16_224_in21k': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_patch16_224_in21k-e5005f0a.pth', - num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), - 'vit_base_patch32_224_in21k': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_patch32_224_in21k-8db57226.pth', - num_classes=21843, 
mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), - 'vit_large_patch16_224_in21k': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch16_224_in21k-606da67d.pth', - num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), - 'vit_large_patch32_224_in21k': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch32_224_in21k-9046d2e7.pth', - num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), - 'vit_huge_patch14_224_in21k': _cfg( - hf_hub='timm/vit_huge_patch14_224_in21k', - num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), - - # deit models (FB weights) - 'vit_deit_tiny_patch16_224': _cfg( - url='https://dl.fbaipublicfiles.com/deit/deit_tiny_patch16_224-a1311bcf.pth'), - 'vit_deit_small_patch16_224': _cfg( - url='https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth'), - 'vit_deit_base_patch16_224': _cfg( - url='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth',), - 'vit_deit_base_patch16_384': _cfg( - url='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_384-8de9b5d1.pth', - input_size=(3, 384, 384), crop_pct=1.0), - 'vit_deit_tiny_distilled_patch16_224': _cfg( - url='https://dl.fbaipublicfiles.com/deit/deit_tiny_distilled_patch16_224-b40b3cf7.pth', - classifier=('head', 'head_dist')), - 'vit_deit_small_distilled_patch16_224': _cfg( - url='https://dl.fbaipublicfiles.com/deit/deit_small_distilled_patch16_224-649709d9.pth', - classifier=('head', 'head_dist')), - 'vit_deit_base_distilled_patch16_224': _cfg( - url='https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_224-df68dfff.pth', - classifier=('head', 'head_dist')), - 'vit_deit_base_distilled_patch16_384': _cfg( - url='https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_384-d0272ac0.pth', - input_size=(3, 384, 384), crop_pct=1.0, classifier=('head', 'head_dist')), -} - - -class Mlp(nn.Module): - def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -class Attention(nn.Module): - def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim ** -0.5 - - self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - def forward(self, x, attn_mask=None): - B, N, C = x.shape - qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) - q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) - - attn = (q @ k.transpose(-2, -1)) * self.scale - if attn_mask!=None: - attn = attn + attn_mask - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B, N, C) - x = self.proj(x) - x = self.proj_drop(x) - return x - - -class Block(nn.Module): - - def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., 
attn_drop=0., - drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): - super().__init__() - self.norm1 = norm_layer(dim) - self.attn = Attention( - dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) - - def forward(self, x, attn_mask=None): - x = x + self.drop_path(self.attn(self.norm1(x), attn_mask=attn_mask)) - x = x + self.drop_path(self.mlp(self.norm2(x))) - return x - - -class PatchEmbed(nn.Module): - """ Image to Patch Embedding - """ - def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None): - super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - self.img_size = img_size - self.patch_size = patch_size - self.patch_grid = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) - self.num_patches = self.patch_grid[0] * self.patch_grid[1] - - self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) - self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() - - def forward(self, x): - B, C, H, W = x.shape - # FIXME look at relaxing size constraints - assert H == self.img_size[0] and W == self.img_size[1], \ - f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." - x = self.proj(x).flatten(2).transpose(1, 2) - x = self.norm(x) - return x - - -class VisionTransformer(nn.Module): - """ Vision Transformer - - A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` - - https://arxiv.org/abs/2010.11929 - - Includes distillation token & head support for `DeiT: Data-efficient Image Transformers` - - https://arxiv.org/abs/2012.12877 - """ - - def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12, - num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, representation_size=None, distilled=False, - drop_rate=0., attn_drop_rate=0., drop_path_rate=0., embed_layer=PatchEmbed, norm_layer=None, - act_layer=None, weight_init=''): - """ - Args: - img_size (int, tuple): input image size - patch_size (int, tuple): patch size - in_chans (int): number of input channels - num_classes (int): number of classes for classification head - embed_dim (int): embedding dimension - depth (int): depth of transformer - num_heads (int): number of attention heads - mlp_ratio (int): ratio of mlp hidden dim to embedding dim - qkv_bias (bool): enable bias for qkv if True - qk_scale (float): override default qk scale of head_dim ** -0.5 if set - representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set - distilled (bool): model includes a distillation token and head as in DeiT models - drop_rate (float): dropout rate - attn_drop_rate (float): attention dropout rate - drop_path_rate (float): stochastic depth rate - embed_layer (nn.Module): patch embedding layer - norm_layer: (nn.Module): normalization layer - weight_init: (str): weight init scheme - """ - super().__init__() - self.num_classes = num_classes - self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models - self.num_tokens = 2 if distilled else 1 - 
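# [Editor's example -- not part of the deleted file. Shape walk-through of the
# fused multi-head attention in Attention.forward above; the optional attn_mask
# is simply added to the scores before the softmax:]
import torch
B, N, C, heads = 2, 197, 768, 12
qkv = torch.randn(B, N, 3 * C)                           # stand-in for self.qkv(x)
qkv = qkv.reshape(B, N, 3, heads, C // heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2]                         # each (B, heads, N, head_dim)
attn = (q @ k.transpose(-2, -1)) * (C // heads) ** -0.5  # (B, heads, N, N)
attn = attn.softmax(dim=-1)
out = (attn @ v).transpose(1, 2).reshape(B, N, C)        # back to (B, N, C)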
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) - act_layer = act_layer or nn.GELU - - self.patch_embed = embed_layer( - img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) - num_patches = self.patch_embed.num_patches - - self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) - self.dist_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if distilled else None - self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) - self.pos_drop = nn.Dropout(p=drop_rate) - - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule - self.blocks = nn.Sequential(*[ - Block( - dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, - drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, act_layer=act_layer) - for i in range(depth)]) - self.norm = norm_layer(embed_dim) - - # Representation layer - if representation_size and not distilled: - self.num_features = representation_size - self.pre_logits = nn.Sequential(OrderedDict([ - ('fc', nn.Linear(embed_dim, representation_size)), - ('act', nn.Tanh()) - ])) - else: - self.pre_logits = nn.Identity() - - # Classifier head(s) - self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() - self.head_dist = None - if distilled: - self.head_dist = nn.Linear(self.embed_dim, self.num_classes) if num_classes > 0 else nn.Identity() - - # Weight init - assert weight_init in ('jax', 'jax_nlhb', 'nlhb', '') - head_bias = -math.log(self.num_classes) if 'nlhb' in weight_init else 0. - trunc_normal_(self.pos_embed, std=.02) - if self.dist_token is not None: - trunc_normal_(self.dist_token, std=.02) - if weight_init.startswith('jax'): - # leave cls token as zeros to match jax impl - for n, m in self.named_modules(): - _init_vit_weights(m, n, head_bias=head_bias, jax_impl=True) - else: - trunc_normal_(self.cls_token, std=.02) - self.apply(_init_vit_weights) - - def _init_weights(self, m): - # this fn left here for compat with downstream users - _init_vit_weights(m) - - @torch.jit.ignore - def no_weight_decay(self): - return {'pos_embed', 'cls_token', 'dist_token'} - - def get_classifier(self): - if self.dist_token is None: - return self.head - else: - return self.head, self.head_dist - - def reset_classifier(self, num_classes, global_pool=''): - self.num_classes = num_classes - self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() - if self.num_tokens == 2: - self.head_dist = nn.Linear(self.embed_dim, self.num_classes) if num_classes > 0 else nn.Identity() - - def forward_features(self, x): - x = self.patch_embed(x) - cls_token = self.cls_token.expand(x.shape[0], -1, -1) # stole cls_tokens impl from Phil Wang, thanks - if self.dist_token is None: - x = torch.cat((cls_token, x), dim=1) - else: - x = torch.cat((cls_token, self.dist_token.expand(x.shape[0], -1, -1), x), dim=1) - x = self.pos_drop(x + self.pos_embed) - x = self.blocks(x) - x = self.norm(x) - return x # cc - # if self.dist_token is None: - # return self.pre_logits(x[:, 0]) - # else: - # return x[:, 0], x[:, 1] - - def forward(self, x): - # x = self.forward_features(x) - x_feat = self.forward_features(x) # cc - if self.dist_token is None: - x = self.pre_logits(x_feat[:, 0]) - else: - x = x_feat[:, 0], x_feat[:, 1] - if self.head_dist is not None: - x, x_dist = self.head(x[0]), self.head_dist(x[1]) # x must be a tuple - if self.training and not 
torch.jit.is_scripting(): - # during training, return both classifier predictions so the distillation loss can use them - return x, x_dist - else: - # during inference, return the average of both classifier predictions - return (x + x_dist) / 2 - else: - x = self.head(x) - return x, x_feat - - -def _init_vit_weights(m, n: str = '', head_bias: float = 0., jax_impl: bool = False): - """ ViT weight initialization - * When called without n, head_bias, jax_impl args it will behave exactly the same - as my original init for compatibility with prev hparam / downstream use cases (ie DeiT). - * When called w/ valid n (module name) and jax_impl=True, will (hopefully) match JAX impl - """ - if isinstance(m, nn.Linear): - if n.startswith('head'): - nn.init.zeros_(m.weight) - nn.init.constant_(m.bias, head_bias) - elif n.startswith('pre_logits'): - lecun_normal_(m.weight) - nn.init.zeros_(m.bias) - else: - if jax_impl: - nn.init.xavier_uniform_(m.weight) - if m.bias is not None: - if 'mlp' in n: - nn.init.normal_(m.bias, std=1e-6) - else: - nn.init.zeros_(m.bias) - else: - trunc_normal_(m.weight, std=.02) - if m.bias is not None: - nn.init.zeros_(m.bias) - elif jax_impl and isinstance(m, nn.Conv2d): - # NOTE conv was left to pytorch default in my original init - lecun_normal_(m.weight) - if m.bias is not None: - nn.init.zeros_(m.bias) - elif isinstance(m, nn.LayerNorm): - nn.init.zeros_(m.bias) - nn.init.ones_(m.weight) - - -def resize_pos_embed(posemb, posemb_new, num_tokens=1): - # Rescale the grid of position embeddings when loading from state_dict. Adapted from - # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 - _logger.info('Resized position embedding: %s to %s', posemb.shape, posemb_new.shape) - ntok_new = posemb_new.shape[1] - if num_tokens: - posemb_tok, posemb_grid = posemb[:, :num_tokens], posemb[0, num_tokens:] - ntok_new -= num_tokens - else: - posemb_tok, posemb_grid = posemb[:, :0], posemb[0] - gs_old = int(math.sqrt(len(posemb_grid))) - gs_new = int(math.sqrt(ntok_new)) - _logger.info('Position embedding grid-size from %s to %s', gs_old, gs_new) - posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) - posemb_grid = F.interpolate(posemb_grid, size=(gs_new, gs_new), mode='bilinear') - posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_new * gs_new, -1) - posemb = torch.cat([posemb_tok, posemb_grid], dim=1) - return posemb - - -def checkpoint_filter_fn(state_dict, model): - """ convert patch embedding weight from manual patchify + linear proj to conv""" - out_dict = {} - if 'model' in state_dict: - # For deit models - state_dict = state_dict['model'] - for k, v in state_dict.items(): - if 'patch_embed.proj.weight' in k and len(v.shape) < 4: - # For old models that I trained prior to conv based patchification - O, I, H, W = model.patch_embed.proj.weight.shape - v = v.reshape(O, -1, H, W) - elif k == 'pos_embed' and v.shape != model.pos_embed.shape: - # To resize pos embedding when using model at different size from pretrained weights - v = resize_pos_embed(v, model.pos_embed, getattr(model, 'num_tokens', 1)) - out_dict[k] = v - return out_dict - - -def _create_vision_transformer(variant, pretrained=False, default_cfg=None, **kwargs): - if default_cfg is None: - default_cfg = deepcopy(default_cfgs[variant]) - overlay_external_default_cfg(default_cfg, kwargs) - default_num_classes = default_cfg['num_classes'] - default_img_size = default_cfg['input_size'][-2:] - - num_classes = kwargs.pop('num_classes', default_num_classes) - img_size = kwargs.pop('img_size',
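# [Editor's example -- not part of the deleted file. resize_pos_embed above keeps
# the class/distillation tokens and 2D-interpolates only the grid portion, e.g.
# when loading a 224px checkpoint (14x14 grid) at 384px (24x24 grid):]
import torch
import torch.nn.functional as F
posemb = torch.randn(1, 197, 768)                        # 14*14 grid + 1 cls token
tok, grid = posemb[:, :1], posemb[0, 1:]
gs_old, gs_new = 14, 24
grid = grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
grid = F.interpolate(grid, size=(gs_new, gs_new), mode='bilinear')
grid = grid.permute(0, 2, 3, 1).reshape(1, gs_new * gs_new, -1)
print(torch.cat([tok, grid], dim=1).shape)               # torch.Size([1, 577, 768])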
default_img_size) - repr_size = kwargs.pop('representation_size', None) - if repr_size is not None and num_classes != default_num_classes: - # Remove representation layer if fine-tuning. This may not always be the desired action, - # but I feel better than doing nothing by default for fine-tuning. Perhaps a better interface? - _logger.warning("Removing representation layer for fine-tuning.") - repr_size = None - - if kwargs.get('features_only', None): - raise RuntimeError('features_only not implemented for Vision Transformer models.') - - model = build_model_with_cfg( - VisionTransformer, variant, pretrained, - default_cfg=default_cfg, - img_size=img_size, - num_classes=num_classes, - representation_size=repr_size, - pretrained_filter_fn=checkpoint_filter_fn, - **kwargs) - - return model - - -@register_model -def vit_small_patch16_224(pretrained=False, **kwargs): - """ My custom 'small' ViT model. embed_dim=768, depth=8, num_heads=8, mlp_ratio=3. - NOTE: - * this differs from the DeiT based 'small' definitions with embed_dim=384, depth=12, num_heads=6 - * this model does not have a bias for QKV (unlike the official ViT and DeiT models) - """ - model_kwargs = dict( - patch_size=16, embed_dim=768, depth=8, num_heads=8, mlp_ratio=3., - qkv_bias=False, norm_layer=nn.LayerNorm, **kwargs) - if pretrained: - # NOTE my scale was wrong for original weights, leaving this here until I have better ones for this model - model_kwargs.setdefault('qk_scale', 768 ** -0.5) - model = _create_vision_transformer('vit_small_patch16_224', pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_base_patch16_224(pretrained=False, **kwargs): - """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). - ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer. - """ - model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) - model = _create_vision_transformer('vit_base_patch16_224', pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_base_patch32_224(pretrained=False, **kwargs): - """ ViT-Base (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929). No pretrained weights. - """ - model_kwargs = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12, **kwargs) - model = _create_vision_transformer('vit_base_patch32_224', pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_base_patch16_384(pretrained=False, **kwargs): - """ ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). - ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer. - """ - model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) - model = _create_vision_transformer('vit_base_patch16_384', pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_base_patch32_384(pretrained=False, **kwargs): - """ ViT-Base model (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929). - ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer. 
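The resize_pos_embed helper above is what lets a checkpoint trained at one resolution load into a model built at another: class and distillation tokens are kept as-is, and only the patch-grid embeddings are bilinearly interpolated. A minimal standalone sketch of the same square-grid case (the function and variable names here are illustrative, not part of the deleted file):

import math

import torch
import torch.nn.functional as F


def resize_grid_pos_embed(posemb, gs_new, num_tokens=1):
    # keep the class/dist token embeddings, fold the rest back into a 2D grid
    posemb_tok, posemb_grid = posemb[:, :num_tokens], posemb[0, num_tokens:]
    gs_old = int(math.sqrt(len(posemb_grid)))
    posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
    posemb_grid = F.interpolate(posemb_grid, size=(gs_new, gs_new), mode='bilinear', align_corners=False)
    posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_new * gs_new, -1)
    return torch.cat([posemb_tok, posemb_grid], dim=1)


# 224px at patch size 16 gives a 14x14 grid; 384px gives 24x24
posemb = torch.randn(1, 1 + 14 * 14, 768)
print(resize_grid_pos_embed(posemb, 24).shape)  # torch.Size([1, 577, 768])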
- """ - model_kwargs = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12, **kwargs) - model = _create_vision_transformer('vit_base_patch32_384', pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_large_patch16_224(pretrained=False, **kwargs): - """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929). - ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer. - """ - model_kwargs = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16, **kwargs) - model = _create_vision_transformer('vit_large_patch16_224', pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_large_patch32_224(pretrained=False, **kwargs): - """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929). No pretrained weights. - """ - model_kwargs = dict(patch_size=32, embed_dim=1024, depth=24, num_heads=16, **kwargs) - model = _create_vision_transformer('vit_large_patch32_224', pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_large_patch16_384(pretrained=False, **kwargs): - """ ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929). - ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer. - """ - model_kwargs = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16, **kwargs) - model = _create_vision_transformer('vit_large_patch16_384', pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_large_patch32_384(pretrained=False, **kwargs): - """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929). - ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer. - """ - model_kwargs = dict(patch_size=32, embed_dim=1024, depth=24, num_heads=16, **kwargs) - model = _create_vision_transformer('vit_large_patch32_384', pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_base_patch16_224_in21k(pretrained=False, **kwargs): - """ ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). - ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. - """ - model_kwargs = dict( - patch_size=16, embed_dim=768, depth=12, num_heads=12, representation_size=768, **kwargs) - model = _create_vision_transformer('vit_base_patch16_224_in21k', pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_base_patch32_224_in21k(pretrained=False, **kwargs): - """ ViT-Base model (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929). - ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. - """ - model_kwargs = dict( - patch_size=32, embed_dim=768, depth=12, num_heads=12, representation_size=768, **kwargs) - model = _create_vision_transformer('vit_base_patch32_224_in21k', pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_large_patch16_224_in21k(pretrained=False, **kwargs): - """ ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929). - ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. 
- """ - model_kwargs = dict( - patch_size=16, embed_dim=1024, depth=24, num_heads=16, representation_size=1024, **kwargs) - model = _create_vision_transformer('vit_large_patch16_224_in21k', pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_large_patch32_224_in21k(pretrained=False, **kwargs): - """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929). - ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. - """ - model_kwargs = dict( - patch_size=32, embed_dim=1024, depth=24, num_heads=16, representation_size=1024, **kwargs) - model = _create_vision_transformer('vit_large_patch32_224_in21k', pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_huge_patch14_224_in21k(pretrained=False, **kwargs): - """ ViT-Huge model (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929). - ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. - NOTE: converted weights not currently available, too large for github release hosting. - """ - model_kwargs = dict( - patch_size=14, embed_dim=1280, depth=32, num_heads=16, representation_size=1280, **kwargs) - model = _create_vision_transformer('vit_huge_patch14_224_in21k', pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_deit_tiny_patch16_224(pretrained=False, **kwargs): - """ DeiT-tiny model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). - ImageNet-1k weights from https://github.com/facebookresearch/deit. - """ - model_kwargs = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3, **kwargs) - model = _create_vision_transformer('vit_deit_tiny_patch16_224', pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_deit_small_patch16_224(pretrained=False, **kwargs): - """ DeiT-small model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). - ImageNet-1k weights from https://github.com/facebookresearch/deit. - """ - model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs) - model = _create_vision_transformer('vit_deit_small_patch16_224', pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_deit_base_patch16_224(pretrained=False, **kwargs): - """ DeiT base model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). - ImageNet-1k weights from https://github.com/facebookresearch/deit. - """ - model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) - model = _create_vision_transformer('vit_deit_base_patch16_224', pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_deit_base_patch16_384(pretrained=False, **kwargs): - """ DeiT base model @ 384x384 from paper (https://arxiv.org/abs/2012.12877). - ImageNet-1k weights from https://github.com/facebookresearch/deit. - """ - model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) - model = _create_vision_transformer('vit_deit_base_patch16_384', pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_deit_tiny_distilled_patch16_224(pretrained=False, **kwargs): - """ DeiT-tiny distilled model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). - ImageNet-1k weights from https://github.com/facebookresearch/deit. 
- """ - model_kwargs = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3, **kwargs) - model = _create_vision_transformer( - 'vit_deit_tiny_distilled_patch16_224', pretrained=pretrained, distilled=True, **model_kwargs) - return model - - -@register_model -def vit_deit_small_distilled_patch16_224(pretrained=False, **kwargs): - """ DeiT-small distilled model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). - ImageNet-1k weights from https://github.com/facebookresearch/deit. - """ - model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs) - model = _create_vision_transformer( - 'vit_deit_small_distilled_patch16_224', pretrained=pretrained, distilled=True, **model_kwargs) - return model - - -@register_model -def vit_deit_base_distilled_patch16_224(pretrained=False, **kwargs): - """ DeiT-base distilled model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). - ImageNet-1k weights from https://github.com/facebookresearch/deit. - """ - model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) - model = _create_vision_transformer( - 'vit_deit_base_distilled_patch16_224', pretrained=pretrained, distilled=True, **model_kwargs) - return model - - -@register_model -def vit_deit_base_distilled_patch16_384(pretrained=False, **kwargs): - """ DeiT-base distilled model @ 384x384 from paper (https://arxiv.org/abs/2012.12877). - ImageNet-1k weights from https://github.com/facebookresearch/deit. - """ - model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) - model = _create_vision_transformer( - 'vit_deit_base_distilled_patch16_384', pretrained=pretrained, distilled=True, **model_kwargs) - return model diff --git a/AVLFormer/src/timm/models/vision_transformer_hybrid.py b/AVLFormer/src/timm/models/vision_transformer_hybrid.py deleted file mode 100644 index 890cec9..0000000 --- a/AVLFormer/src/timm/models/vision_transformer_hybrid.py +++ /dev/null @@ -1,313 +0,0 @@ -""" Hybrid Vision Transformer (ViT) in PyTorch - -A PyTorch implement of the Hybrid Vision Transformers as described in -'An Image Is Worth 16 x 16 Words: Transformers for Image Recognition at Scale' - - https://arxiv.org/abs/2010.11929 - -NOTE This relies on code in vision_transformer.py. The hybrid model definitions were moved here to -keep file sizes sane. 
- -Hacked together by / Copyright 2020 Ross Wightman -""" -from copy import deepcopy -from functools import partial - -import torch -import torch.nn as nn - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .layers import StdConv2dSame, StdConv2d, to_2tuple -from .resnet import resnet26d, resnet50d -from .resnetv2 import ResNetV2, create_resnetv2_stem -from .registry import register_model -from src.timm.models.vision_transformer import _create_vision_transformer - - -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, - 'crop_pct': .9, 'interpolation': 'bicubic', - 'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5), - 'first_conv': 'patch_embed.backbone.stem.conv', 'classifier': 'head', - **kwargs - } - - -default_cfgs = { - # hybrid in-21k models (weights ported from official Google JAX impl where they exist) - 'vit_base_r50_s16_224_in21k': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_resnet50_224_in21k-6f7c7740.pth', - num_classes=21843, crop_pct=0.9), - - # hybrid in-1k models (weights ported from official JAX impl) - 'vit_base_r50_s16_384': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_resnet50_384-9fd3c705.pth', - input_size=(3, 384, 384), crop_pct=1.0), - - # hybrid in-1k models (mostly untrained, experimental configs w/ resnetv2 stdconv backbones) - 'vit_tiny_r_s16_p8_224': _cfg(), - 'vit_small_r_s16_p8_224': _cfg(), - 'vit_small_r20_s16_p2_224': _cfg(), - 'vit_small_r20_s16_224': _cfg(), - 'vit_small_r26_s32_224': _cfg(), - 'vit_base_r20_s16_224': _cfg(), - 'vit_base_r26_s32_224': _cfg(), - 'vit_base_r50_s16_224': _cfg(), - 'vit_large_r50_s32_224': _cfg(), - - # hybrid models (using timm resnet backbones) - 'vit_small_resnet26d_224': _cfg(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), - 'vit_small_resnet50d_s16_224': _cfg(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), - 'vit_base_resnet26d_224': _cfg(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), - 'vit_base_resnet50d_224': _cfg(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), -} - - -class HybridEmbed(nn.Module): - """ CNN Feature Map Embedding - Extract feature map from CNN, flatten, project to embedding dim. 
- """ - def __init__(self, backbone, img_size=224, patch_size=1, feature_size=None, in_chans=3, embed_dim=768): - super().__init__() - assert isinstance(backbone, nn.Module) - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - self.img_size = img_size - self.patch_size = patch_size - self.backbone = backbone - if feature_size is None: - with torch.no_grad(): - # NOTE Most reliable way of determining output dims is to run forward pass - training = backbone.training - if training: - backbone.eval() - o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1])) - if isinstance(o, (list, tuple)): - o = o[-1] # last feature if backbone outputs list/tuple of features - feature_size = o.shape[-2:] - feature_dim = o.shape[1] - backbone.train(training) - else: - feature_size = to_2tuple(feature_size) - if hasattr(self.backbone, 'feature_info'): - feature_dim = self.backbone.feature_info.channels()[-1] - else: - feature_dim = self.backbone.num_features - assert feature_size[0] % patch_size[0] == 0 and feature_size[1] % patch_size[1] == 0 - self.num_patches = feature_size[0] // patch_size[0] * feature_size[1] // patch_size[1] - self.proj = nn.Conv2d(feature_dim, embed_dim, kernel_size=patch_size, stride=patch_size) - - def forward(self, x): - x = self.backbone(x) - if isinstance(x, (list, tuple)): - x = x[-1] # last feature if backbone outputs list/tuple of features - x = self.proj(x).flatten(2).transpose(1, 2) - return x - - -def _create_vision_transformer_hybrid(variant, backbone, pretrained=False, **kwargs): - default_cfg = deepcopy(default_cfgs[variant]) - embed_layer = partial(HybridEmbed, backbone=backbone) - kwargs.setdefault('patch_size', 1) # default patch size for hybrid models if not set - return _create_vision_transformer( - variant, pretrained=pretrained, default_cfg=default_cfg, embed_layer=embed_layer, **kwargs) - - -def _resnetv2(layers=(3, 4, 9), **kwargs): - """ ResNet-V2 backbone helper""" - padding_same = kwargs.get('padding_same', True) - if padding_same: - stem_type = 'same' - conv_layer = StdConv2dSame - else: - stem_type = '' - conv_layer = StdConv2d - if len(layers): - backbone = ResNetV2( - layers=layers, num_classes=0, global_pool='', in_chans=kwargs.get('in_chans', 3), - preact=False, stem_type=stem_type, conv_layer=conv_layer) - else: - backbone = create_resnetv2_stem( - kwargs.get('in_chans', 3), stem_type=stem_type, preact=False, conv_layer=conv_layer) - return backbone - - -@register_model -def vit_base_r50_s16_224_in21k(pretrained=False, **kwargs): - """ R50+ViT-B/16 hybrid model from original paper (https://arxiv.org/abs/2010.11929). - ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. - """ - backbone = _resnetv2(layers=(3, 4, 9), **kwargs) - model_kwargs = dict(embed_dim=768, depth=12, num_heads=12, representation_size=768, **kwargs) - model = _create_vision_transformer_hybrid( - 'vit_base_r50_s16_224_in21k', backbone=backbone, pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_base_resnet50_224_in21k(pretrained=False, **kwargs): - # NOTE this is forwarding to model def above for backwards compatibility - return vit_base_r50_s16_224_in21k(pretrained=pretrained, **kwargs) - - -@register_model -def vit_base_r50_s16_384(pretrained=False, **kwargs): - """ R50+ViT-B/16 hybrid from original paper (https://arxiv.org/abs/2010.11929). - ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer. 
- """ - backbone = _resnetv2((3, 4, 9), **kwargs) - model_kwargs = dict(embed_dim=768, depth=12, num_heads=12, **kwargs) - model = _create_vision_transformer_hybrid( - 'vit_base_r50_s16_384', backbone=backbone, pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_base_resnet50_384(pretrained=False, **kwargs): - # NOTE this is forwarding to model def above for backwards compatibility - return vit_base_r50_s16_384(pretrained=pretrained, **kwargs) - - -@register_model -def vit_tiny_r_s16_p8_224(pretrained=False, **kwargs): - """ R+ViT-Ti/S16 w/ 8x8 patch hybrid @ 224 x 224. - """ - backbone = _resnetv2(layers=(), **kwargs) - model_kwargs = dict(patch_size=8, embed_dim=192, depth=12, num_heads=3, **kwargs) - model = _create_vision_transformer_hybrid( - 'vit_tiny_r_s16_p8_224', backbone=backbone, pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_small_r_s16_p8_224(pretrained=False, **kwargs): - """ R+ViT-S/S16 w/ 8x8 patch hybrid @ 224 x 224. - """ - backbone = _resnetv2(layers=(), **kwargs) - model_kwargs = dict(patch_size=8, embed_dim=384, depth=12, num_heads=6, **kwargs) - model = _create_vision_transformer_hybrid( - 'vit_small_r_s16_p8_224', backbone=backbone, pretrained=pretrained, **model_kwargs) - - return model - - -@register_model -def vit_small_r20_s16_p2_224(pretrained=False, **kwargs): - """ R52+ViT-S/S16 w/ 2x2 patch hybrid @ 224 x 224. - """ - backbone = _resnetv2((2, 4), **kwargs) - model_kwargs = dict(patch_size=2, embed_dim=384, depth=12, num_heads=6, **kwargs) - model = _create_vision_transformer_hybrid( - 'vit_small_r20_s16_p2_224', backbone=backbone, pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_small_r20_s16_224(pretrained=False, **kwargs): - """ R20+ViT-S/S16 hybrid. - """ - backbone = _resnetv2((2, 2, 2), **kwargs) - model_kwargs = dict(embed_dim=384, depth=12, num_heads=6, **kwargs) - model = _create_vision_transformer_hybrid( - 'vit_small_r20_s16_224', backbone=backbone, pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_small_r26_s32_224(pretrained=False, **kwargs): - """ R26+ViT-S/S32 hybrid. - """ - backbone = _resnetv2((2, 2, 2, 2), **kwargs) - model_kwargs = dict(embed_dim=384, depth=12, num_heads=6, **kwargs) - model = _create_vision_transformer_hybrid( - 'vit_small_r26_s32_224', backbone=backbone, pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_base_r20_s16_224(pretrained=False, **kwargs): - """ R20+ViT-B/S16 hybrid. - """ - backbone = _resnetv2((2, 2, 2), **kwargs) - model_kwargs = dict(embed_dim=768, depth=12, num_heads=12, **kwargs) - model = _create_vision_transformer_hybrid( - 'vit_base_r20_s16_224', backbone=backbone, pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_base_r26_s32_224(pretrained=False, **kwargs): - """ R26+ViT-B/S32 hybrid. - """ - backbone = _resnetv2((2, 2, 2, 2), **kwargs) - model_kwargs = dict(embed_dim=768, depth=12, num_heads=12, **kwargs) - model = _create_vision_transformer_hybrid( - 'vit_base_r26_s32_224', backbone=backbone, pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_base_r50_s16_224(pretrained=False, **kwargs): - """ R50+ViT-B/S16 hybrid from original paper (https://arxiv.org/abs/2010.11929). 
- """ - backbone = _resnetv2((3, 4, 9), **kwargs) - model_kwargs = dict(embed_dim=768, depth=12, num_heads=12, **kwargs) - model = _create_vision_transformer_hybrid( - 'vit_base_r50_s16_224', backbone=backbone, pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_large_r50_s32_224(pretrained=False, **kwargs): - """ R50+ViT-L/S32 hybrid. - """ - backbone = _resnetv2((3, 4, 6, 3), **kwargs) - model_kwargs = dict(embed_dim=768, depth=12, num_heads=12, **kwargs) - model = _create_vision_transformer_hybrid( - 'vit_large_r50_s32_224', backbone=backbone, pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_small_resnet26d_224(pretrained=False, **kwargs): - """ Custom ViT small hybrid w/ ResNet26D stride 32. No pretrained weights. - """ - backbone = resnet26d(pretrained=pretrained, in_chans=kwargs.get('in_chans', 3), features_only=True, out_indices=[4]) - model_kwargs = dict(embed_dim=768, depth=8, num_heads=8, mlp_ratio=3, **kwargs) - model = _create_vision_transformer_hybrid( - 'vit_small_resnet26d_224', backbone=backbone, pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_small_resnet50d_s16_224(pretrained=False, **kwargs): - """ Custom ViT small hybrid w/ ResNet50D 3-stages, stride 16. No pretrained weights. - """ - backbone = resnet50d(pretrained=pretrained, in_chans=kwargs.get('in_chans', 3), features_only=True, out_indices=[3]) - model_kwargs = dict(embed_dim=768, depth=8, num_heads=8, mlp_ratio=3, **kwargs) - model = _create_vision_transformer_hybrid( - 'vit_small_resnet50d_s16_224', backbone=backbone, pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_base_resnet26d_224(pretrained=False, **kwargs): - """ Custom ViT base hybrid w/ ResNet26D stride 32. No pretrained weights. - """ - backbone = resnet26d(pretrained=pretrained, in_chans=kwargs.get('in_chans', 3), features_only=True, out_indices=[4]) - model_kwargs = dict(embed_dim=768, depth=12, num_heads=12, **kwargs) - model = _create_vision_transformer_hybrid( - 'vit_base_resnet26d_224', backbone=backbone, pretrained=pretrained, **model_kwargs) - return model - - -@register_model -def vit_base_resnet50d_224(pretrained=False, **kwargs): - """ Custom ViT base hybrid w/ ResNet50D stride 32. No pretrained weights. - """ - backbone = resnet50d(pretrained=pretrained, in_chans=kwargs.get('in_chans', 3), features_only=True, out_indices=[4]) - model_kwargs = dict(embed_dim=768, depth=12, num_heads=12, **kwargs) - model = _create_vision_transformer_hybrid( - 'vit_base_resnet50d_224', backbone=backbone, pretrained=pretrained, **model_kwargs) - return model \ No newline at end of file diff --git a/AVLFormer/src/timm/models/vovnet.py b/AVLFormer/src/timm/models/vovnet.py deleted file mode 100644 index 150670d..0000000 --- a/AVLFormer/src/timm/models/vovnet.py +++ /dev/null @@ -1,406 +0,0 @@ -""" VoVNet (V1 & V2) - -Papers: -* `An Energy and GPU-Computation Efficient Backbone Network` - https://arxiv.org/abs/1904.09730 -* `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667 - -Looked at https://github.com/youngwanLEE/vovnet-detectron2 & -https://github.com/stigma0617/VoVNet.pytorch/blob/master/models_vovnet/vovnet.py -for some reference, rewrote most of the code. 
- -Hacked together by / Copyright 2020 Ross Wightman -""" - -from typing import List - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from src.timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from .registry import register_model -from .helpers import build_model_with_cfg -from .layers import ConvBnAct, SeparableConvBnAct, BatchNormAct2d, ClassifierHead, DropPath,\ - create_attn, create_norm_act, get_norm_act_layer - - -# model cfgs adapted from https://github.com/youngwanLEE/vovnet-detectron2 & -# https://github.com/stigma0617/VoVNet.pytorch/blob/master/models_vovnet/vovnet.py -model_cfgs = dict( - vovnet39a=dict( - stem_chs=[64, 64, 128], - stage_conv_chs=[128, 160, 192, 224], - stage_out_chs=[256, 512, 768, 1024], - layer_per_block=5, - block_per_stage=[1, 1, 2, 2], - residual=False, - depthwise=False, - attn='', - ), - vovnet57a=dict( - stem_chs=[64, 64, 128], - stage_conv_chs=[128, 160, 192, 224], - stage_out_chs=[256, 512, 768, 1024], - layer_per_block=5, - block_per_stage=[1, 1, 4, 3], - residual=False, - depthwise=False, - attn='', - - ), - ese_vovnet19b_slim_dw=dict( - stem_chs=[64, 64, 64], - stage_conv_chs=[64, 80, 96, 112], - stage_out_chs=[112, 256, 384, 512], - layer_per_block=3, - block_per_stage=[1, 1, 1, 1], - residual=True, - depthwise=True, - attn='ese', - - ), - ese_vovnet19b_dw=dict( - stem_chs=[64, 64, 64], - stage_conv_chs=[128, 160, 192, 224], - stage_out_chs=[256, 512, 768, 1024], - layer_per_block=3, - block_per_stage=[1, 1, 1, 1], - residual=True, - depthwise=True, - attn='ese', - ), - ese_vovnet19b_slim=dict( - stem_chs=[64, 64, 128], - stage_conv_chs=[64, 80, 96, 112], - stage_out_chs=[112, 256, 384, 512], - layer_per_block=3, - block_per_stage=[1, 1, 1, 1], - residual=True, - depthwise=False, - attn='ese', - ), - ese_vovnet19b=dict( - stem_chs=[64, 64, 128], - stage_conv_chs=[128, 160, 192, 224], - stage_out_chs=[256, 512, 768, 1024], - layer_per_block=3, - block_per_stage=[1, 1, 1, 1], - residual=True, - depthwise=False, - attn='ese', - - ), - ese_vovnet39b=dict( - stem_chs=[64, 64, 128], - stage_conv_chs=[128, 160, 192, 224], - stage_out_chs=[256, 512, 768, 1024], - layer_per_block=5, - block_per_stage=[1, 1, 2, 2], - residual=True, - depthwise=False, - attn='ese', - ), - ese_vovnet57b=dict( - stem_chs=[64, 64, 128], - stage_conv_chs=[128, 160, 192, 224], - stage_out_chs=[256, 512, 768, 1024], - layer_per_block=5, - block_per_stage=[1, 1, 4, 3], - residual=True, - depthwise=False, - attn='ese', - - ), - ese_vovnet99b=dict( - stem_chs=[64, 64, 128], - stage_conv_chs=[128, 160, 192, 224], - stage_out_chs=[256, 512, 768, 1024], - layer_per_block=5, - block_per_stage=[1, 3, 9, 3], - residual=True, - depthwise=False, - attn='ese', - ), - eca_vovnet39b=dict( - stem_chs=[64, 64, 128], - stage_conv_chs=[128, 160, 192, 224], - stage_out_chs=[256, 512, 768, 1024], - layer_per_block=5, - block_per_stage=[1, 1, 2, 2], - residual=True, - depthwise=False, - attn='eca', - ), -) -model_cfgs['ese_vovnet39b_evos'] = model_cfgs['ese_vovnet39b'] -model_cfgs['ese_vovnet99b_iabn'] = model_cfgs['ese_vovnet99b'] - - -def _cfg(url=''): - return { - 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), - 'crop_pct': 0.875, 'interpolation': 'bicubic', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'stem.0.conv', 'classifier': 'head.fc', - } - - -default_cfgs = dict( - vovnet39a=_cfg(url=''), - vovnet57a=_cfg(url=''), - ese_vovnet19b_slim_dw=_cfg(url=''), - ese_vovnet19b_dw=_cfg( - 
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ese_vovnet19b_dw-a8741004.pth'), - ese_vovnet19b_slim=_cfg(url=''), - ese_vovnet39b=_cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ese_vovnet39b-f912fe73.pth'), - ese_vovnet57b=_cfg(url=''), - ese_vovnet99b=_cfg(url=''), - eca_vovnet39b=_cfg(url=''), - ese_vovnet39b_evos=_cfg(url=''), - ese_vovnet99b_iabn=_cfg(url=''), -) - - -class SequentialAppendList(nn.Sequential): - def __init__(self, *args): - super(SequentialAppendList, self).__init__(*args) - - def forward(self, x: torch.Tensor, concat_list: List[torch.Tensor]) -> torch.Tensor: - for i, module in enumerate(self): - if i == 0: - concat_list.append(module(x)) - else: - concat_list.append(module(concat_list[-1])) - x = torch.cat(concat_list, dim=1) - return x - - -class OsaBlock(nn.Module): - - def __init__(self, in_chs, mid_chs, out_chs, layer_per_block, residual=False, - depthwise=False, attn='', norm_layer=BatchNormAct2d, act_layer=nn.ReLU, drop_path=None): - super(OsaBlock, self).__init__() - - self.residual = residual - self.depthwise = depthwise - conv_kwargs = dict(norm_layer=norm_layer, act_layer=act_layer) - - next_in_chs = in_chs - if self.depthwise and next_in_chs != mid_chs: - assert not residual - self.conv_reduction = ConvBnAct(next_in_chs, mid_chs, 1, **conv_kwargs) - else: - self.conv_reduction = None - - mid_convs = [] - for i in range(layer_per_block): - if self.depthwise: - conv = SeparableConvBnAct(mid_chs, mid_chs, **conv_kwargs) - else: - conv = ConvBnAct(next_in_chs, mid_chs, 3, **conv_kwargs) - next_in_chs = mid_chs - mid_convs.append(conv) - self.conv_mid = SequentialAppendList(*mid_convs) - - # feature aggregation - next_in_chs = in_chs + layer_per_block * mid_chs - self.conv_concat = ConvBnAct(next_in_chs, out_chs, **conv_kwargs) - - if attn: - self.attn = create_attn(attn, out_chs) - else: - self.attn = None - - self.drop_path = drop_path - - def forward(self, x): - output = [x] - if self.conv_reduction is not None: - x = self.conv_reduction(x) - x = self.conv_mid(x, output) - x = self.conv_concat(x) - if self.attn is not None: - x = self.attn(x) - if self.drop_path is not None: - x = self.drop_path(x) - if self.residual: - x = x + output[0] - return x - - -class OsaStage(nn.Module): - - def __init__(self, in_chs, mid_chs, out_chs, block_per_stage, layer_per_block, downsample=True, - residual=True, depthwise=False, attn='ese', norm_layer=BatchNormAct2d, act_layer=nn.ReLU, - drop_path_rates=None): - super(OsaStage, self).__init__() - - if downsample: - self.pool = nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True) - else: - self.pool = None - - blocks = [] - for i in range(block_per_stage): - last_block = i == block_per_stage - 1 - if drop_path_rates is not None and drop_path_rates[i] > 0.: - drop_path = DropPath(drop_path_rates[i]) - else: - drop_path = None - blocks += [OsaBlock( - in_chs, mid_chs, out_chs, layer_per_block, residual=residual and i > 0, depthwise=depthwise, - attn=attn if last_block else '', norm_layer=norm_layer, act_layer=act_layer, drop_path=drop_path) - ] - in_chs = out_chs - self.blocks = nn.Sequential(*blocks) - - def forward(self, x): - if self.pool is not None: - x = self.pool(x) - x = self.blocks(x) - return x - - -class VovNet(nn.Module): - - def __init__(self, cfg, in_chans=3, num_classes=1000, global_pool='avg', drop_rate=0., stem_stride=4, - output_stride=32, norm_layer=BatchNormAct2d, act_layer=nn.ReLU, drop_path_rate=0.): - """ VovNet (v2) - 
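OsaBlock below is the core of VoVNet, one-shot aggregation: a short chain of 3x3 convs whose input and every intermediate output are kept (SequentialAppendList does that bookkeeping), concatenated a single time, and fused by one 1x1 conv. A stripped-down sketch of the same dataflow, without the depthwise, attention, and residual options:

import torch
import torch.nn as nn


class TinyOsa(nn.Module):
    def __init__(self, in_chs=64, mid_chs=32, out_chs=128, layer_per_block=3):
        super().__init__()
        self.convs = nn.ModuleList()
        chs = in_chs
        for _ in range(layer_per_block):
            self.convs.append(nn.Sequential(nn.Conv2d(chs, mid_chs, 3, padding=1), nn.ReLU()))
            chs = mid_chs
        # aggregate the input plus every intermediate feature exactly once
        self.conv_concat = nn.Conv2d(in_chs + layer_per_block * mid_chs, out_chs, 1)

    def forward(self, x):
        feats = [x]
        for conv in self.convs:
            feats.append(conv(feats[-1]))
        return self.conv_concat(torch.cat(feats, dim=1))


print(TinyOsa()(torch.randn(1, 64, 56, 56)).shape)  # torch.Size([1, 128, 56, 56])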
""" - super(VovNet, self).__init__() - self.num_classes = num_classes - self.drop_rate = drop_rate - assert stem_stride in (4, 2) - assert output_stride == 32 # FIXME support dilation - - stem_chs = cfg["stem_chs"] - stage_conv_chs = cfg["stage_conv_chs"] - stage_out_chs = cfg["stage_out_chs"] - block_per_stage = cfg["block_per_stage"] - layer_per_block = cfg["layer_per_block"] - conv_kwargs = dict(norm_layer=norm_layer, act_layer=act_layer) - - # Stem module - last_stem_stride = stem_stride // 2 - conv_type = SeparableConvBnAct if cfg["depthwise"] else ConvBnAct - self.stem = nn.Sequential(*[ - ConvBnAct(in_chans, stem_chs[0], 3, stride=2, **conv_kwargs), - conv_type(stem_chs[0], stem_chs[1], 3, stride=1, **conv_kwargs), - conv_type(stem_chs[1], stem_chs[2], 3, stride=last_stem_stride, **conv_kwargs), - ]) - self.feature_info = [dict( - num_chs=stem_chs[1], reduction=2, module=f'stem.{1 if stem_stride == 4 else 2}')] - current_stride = stem_stride - - # OSA stages - stage_dpr = torch.split(torch.linspace(0, drop_path_rate, sum(block_per_stage)), block_per_stage) - in_ch_list = stem_chs[-1:] + stage_out_chs[:-1] - stage_args = dict(residual=cfg["residual"], depthwise=cfg["depthwise"], attn=cfg["attn"], **conv_kwargs) - stages = [] - for i in range(4): # num_stages - downsample = stem_stride == 2 or i > 0 # first stage has no stride/downsample if stem_stride is 4 - stages += [OsaStage( - in_ch_list[i], stage_conv_chs[i], stage_out_chs[i], block_per_stage[i], layer_per_block, - downsample=downsample, drop_path_rates=stage_dpr[i], **stage_args) - ] - self.num_features = stage_out_chs[i] - current_stride *= 2 if downsample else 1 - self.feature_info += [dict(num_chs=self.num_features, reduction=current_stride, module=f'stages.{i}')] - - self.stages = nn.Sequential(*stages) - - self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=drop_rate) - - for n, m in self.named_modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1.) - nn.init.constant_(m.bias, 0.) 
- elif isinstance(m, nn.Linear): - nn.init.zeros_(m.bias) - - def get_classifier(self): - return self.head.fc - - def reset_classifier(self, num_classes, global_pool='avg'): - self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) - - def forward_features(self, x): - x = self.stem(x) - return self.stages(x) - - def forward(self, x): - x = self.forward_features(x) - return self.head(x) - - -def _create_vovnet(variant, pretrained=False, **kwargs): - return build_model_with_cfg( - VovNet, variant, pretrained, - default_cfg=default_cfgs[variant], - model_cfg=model_cfgs[variant], - feature_cfg=dict(flatten_sequential=True), - **kwargs) - - -@register_model -def vovnet39a(pretrained=False, **kwargs): - return _create_vovnet('vovnet39a', pretrained=pretrained, **kwargs) - - -@register_model -def vovnet57a(pretrained=False, **kwargs): - return _create_vovnet('vovnet57a', pretrained=pretrained, **kwargs) - - -@register_model -def ese_vovnet19b_slim_dw(pretrained=False, **kwargs): - return _create_vovnet('ese_vovnet19b_slim_dw', pretrained=pretrained, **kwargs) - - -@register_model -def ese_vovnet19b_dw(pretrained=False, **kwargs): - return _create_vovnet('ese_vovnet19b_dw', pretrained=pretrained, **kwargs) - - -@register_model -def ese_vovnet19b_slim(pretrained=False, **kwargs): - return _create_vovnet('ese_vovnet19b_slim', pretrained=pretrained, **kwargs) - - -@register_model -def ese_vovnet39b(pretrained=False, **kwargs): - return _create_vovnet('ese_vovnet39b', pretrained=pretrained, **kwargs) - - -@register_model -def ese_vovnet57b(pretrained=False, **kwargs): - return _create_vovnet('ese_vovnet57b', pretrained=pretrained, **kwargs) - - -@register_model -def ese_vovnet99b(pretrained=False, **kwargs): - return _create_vovnet('ese_vovnet99b', pretrained=pretrained, **kwargs) - - -@register_model -def eca_vovnet39b(pretrained=False, **kwargs): - return _create_vovnet('eca_vovnet39b', pretrained=pretrained, **kwargs) - - -# Experimental Models - -@register_model -def ese_vovnet39b_evos(pretrained=False, **kwargs): - def norm_act_fn(num_features, **nkwargs): - return create_norm_act('EvoNormSample', num_features, jit=False, **nkwargs) - return _create_vovnet('ese_vovnet39b_evos', pretrained=pretrained, norm_layer=norm_act_fn, **kwargs) - - -@register_model -def ese_vovnet99b_iabn(pretrained=False, **kwargs): - norm_layer = get_norm_act_layer('iabn') - return _create_vovnet( - 'ese_vovnet99b_iabn', pretrained=pretrained, norm_layer=norm_layer, act_layer=nn.LeakyReLU, **kwargs) diff --git a/AVLFormer/src/timm/models/xception.py b/AVLFormer/src/timm/models/xception.py deleted file mode 100644 index 86f558c..0000000 --- a/AVLFormer/src/timm/models/xception.py +++ /dev/null @@ -1,232 +0,0 @@ -""" -Ported to pytorch thanks to [tstandley](https://github.com/tstandley/Xception-PyTorch) - -@author: tstandley -Adapted by cadene - -Creates an Xception Model as defined in: - -Francois Chollet -Xception: Deep Learning with Depthwise Separable Convolutions -https://arxiv.org/pdf/1610.02357.pdf - -These weights were ported from the Keras implementation. 
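Xception's SeparableConv2d, defined just below, factorizes a dense 3x3 convolution into a per-channel depthwise 3x3 followed by a 1x1 pointwise projection, which is the premise of the whole architecture. A quick parameter-count comparison at 256 channels:

import torch.nn as nn


def n_params(m):
    return sum(p.numel() for p in m.parameters())


dense = nn.Conv2d(256, 256, 3, padding=1, bias=False)
separable = nn.Sequential(
    nn.Conv2d(256, 256, 3, padding=1, groups=256, bias=False),  # depthwise 3x3
    nn.Conv2d(256, 256, 1, bias=False),                         # pointwise 1x1
)
print(n_params(dense), n_params(separable))  # 589824 vs 67840, roughly 8.7x fewer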
Achieves the following performance on the validation set: - -Loss:0.9173 Prec@1:78.892 Prec@5:94.292 - -REMEMBER to set your image size to 3x299x299 for both test and validation - -normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], - std=[0.5, 0.5, 0.5]) - -The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299 -""" - -import torch.nn as nn -import torch.nn.functional as F - -from .helpers import build_model_with_cfg -from .layers import create_classifier -from .registry import register_model - -__all__ = ['Xception'] - -default_cfgs = { - 'xception': { - 'url': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-cadene/xception-43020ad28.pth', - 'input_size': (3, 299, 299), - 'pool_size': (10, 10), - 'crop_pct': 0.8975, - 'interpolation': 'bicubic', - 'mean': (0.5, 0.5, 0.5), - 'std': (0.5, 0.5, 0.5), - 'num_classes': 1000, - 'first_conv': 'conv1', - 'classifier': 'fc' - # The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299 - } -} - - -class SeparableConv2d(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1): - super(SeparableConv2d, self).__init__() - - self.conv1 = nn.Conv2d( - in_channels, in_channels, kernel_size, stride, padding, dilation, groups=in_channels, bias=False) - self.pointwise = nn.Conv2d(in_channels, out_channels, 1, 1, 0, 1, 1, bias=False) - - def forward(self, x): - x = self.conv1(x) - x = self.pointwise(x) - return x - - -class Block(nn.Module): - def __init__(self, in_channels, out_channels, reps, strides=1, start_with_relu=True, grow_first=True): - super(Block, self).__init__() - - if out_channels != in_channels or strides != 1: - self.skip = nn.Conv2d(in_channels, out_channels, 1, stride=strides, bias=False) - self.skipbn = nn.BatchNorm2d(out_channels) - else: - self.skip = None - - rep = [] - for i in range(reps): - if grow_first: - inc = in_channels if i == 0 else out_channels - outc = out_channels - else: - inc = in_channels - outc = in_channels if i < (reps - 1) else out_channels - rep.append(nn.ReLU(inplace=True)) - rep.append(SeparableConv2d(inc, outc, 3, stride=1, padding=1)) - rep.append(nn.BatchNorm2d(outc)) - - if not start_with_relu: - rep = rep[1:] - else: - rep[0] = nn.ReLU(inplace=False) - - if strides != 1: - rep.append(nn.MaxPool2d(3, strides, 1)) - self.rep = nn.Sequential(*rep) - - def forward(self, inp): - x = self.rep(inp) - - if self.skip is not None: - skip = self.skip(inp) - skip = self.skipbn(skip) - else: - skip = inp - - x += skip - return x - - -class Xception(nn.Module): - """ - Xception optimized for the ImageNet dataset, as specified in - https://arxiv.org/pdf/1610.02357.pdf - """ - - def __init__(self, num_classes=1000, in_chans=3, drop_rate=0., global_pool='avg'): - """ Constructor - Args: - num_classes: number of classes - """ - super(Xception, self).__init__() - self.drop_rate = drop_rate - self.global_pool = global_pool - self.num_classes = num_classes - self.num_features = 2048 - - self.conv1 = nn.Conv2d(in_chans, 32, 3, 2, 0, bias=False) - self.bn1 = nn.BatchNorm2d(32) - self.act1 = nn.ReLU(inplace=True) - - self.conv2 = nn.Conv2d(32, 64, 3, bias=False) - self.bn2 = nn.BatchNorm2d(64) - self.act2 = nn.ReLU(inplace=True) - - self.block1 = Block(64, 128, 2, 2, start_with_relu=False) - self.block2 = Block(128, 256, 2, 2) - self.block3 = Block(256, 728, 2, 2) - - self.block4 = Block(728, 728, 3, 1) - self.block5 = Block(728, 728, 3, 1) - self.block6 = 
Block(728, 728, 3, 1) - self.block7 = Block(728, 728, 3, 1) - - self.block8 = Block(728, 728, 3, 1) - self.block9 = Block(728, 728, 3, 1) - self.block10 = Block(728, 728, 3, 1) - self.block11 = Block(728, 728, 3, 1) - - self.block12 = Block(728, 1024, 2, 2, grow_first=False) - - self.conv3 = SeparableConv2d(1024, 1536, 3, 1, 1) - self.bn3 = nn.BatchNorm2d(1536) - self.act3 = nn.ReLU(inplace=True) - - self.conv4 = SeparableConv2d(1536, self.num_features, 3, 1, 1) - self.bn4 = nn.BatchNorm2d(self.num_features) - self.act4 = nn.ReLU(inplace=True) - self.feature_info = [ - dict(num_chs=64, reduction=2, module='act2'), - dict(num_chs=128, reduction=4, module='block2.rep.0'), - dict(num_chs=256, reduction=8, module='block3.rep.0'), - dict(num_chs=728, reduction=16, module='block12.rep.0'), - dict(num_chs=2048, reduction=32, module='act4'), - ] - - self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) - - # #------- init weights -------- - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1) - m.bias.data.zero_() - - def get_classifier(self): - return self.fc - - def reset_classifier(self, num_classes, global_pool='avg'): - self.num_classes = num_classes - self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) - - def forward_features(self, x): - x = self.conv1(x) - x = self.bn1(x) - x = self.act1(x) - - x = self.conv2(x) - x = self.bn2(x) - x = self.act2(x) - - x = self.block1(x) - x = self.block2(x) - x = self.block3(x) - x = self.block4(x) - x = self.block5(x) - x = self.block6(x) - x = self.block7(x) - x = self.block8(x) - x = self.block9(x) - x = self.block10(x) - x = self.block11(x) - x = self.block12(x) - - x = self.conv3(x) - x = self.bn3(x) - x = self.act3(x) - - x = self.conv4(x) - x = self.bn4(x) - x = self.act4(x) - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.global_pool(x) - if self.drop_rate: - x = F.dropout(x, self.drop_rate, training=self.training) # dropout is not in-place; assign the result - x = self.fc(x) - return x - - -def _xception(variant, pretrained=False, **kwargs): - return build_model_with_cfg( - Xception, variant, pretrained, - default_cfg=default_cfgs[variant], - feature_cfg=dict(feature_cls='hook'), - **kwargs) - - -@register_model -def xception(pretrained=False, **kwargs): - return _xception('xception', pretrained=pretrained, **kwargs) diff --git a/AVLFormer/src/timm/models/xception_aligned.py b/AVLFormer/src/timm/models/xception_aligned.py deleted file mode 100644 index 822b042..0000000 --- a/AVLFormer/src/timm/models/xception_aligned.py +++ /dev/null @@ -1,238 +0,0 @@ -"""Pytorch impl of Aligned Xception 41, 65, 71 - -This is a correct, from scratch impl of Aligned Xception (Deeplab) models compatible with TF weights at -https://github.com/tensorflow/models/blob/master/research/deeplab/g3doc/model_zoo.md - -Hacked together by / Copyright 2020 Ross Wightman -""" -from functools import partial - -import torch.nn as nn -import torch.nn.functional as F - -from src.timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD -from .helpers import build_model_with_cfg -from .layers import ClassifierHead, ConvBnAct, create_conv2d -from .layers.helpers import to_3tuple -from .registry import register_model - -__all__ = ['XceptionAligned'] - - -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 299, 
299), 'pool_size': (10, 10), - 'crop_pct': 0.903, 'interpolation': 'bicubic', - 'mean': IMAGENET_INCEPTION_MEAN, 'std': IMAGENET_INCEPTION_STD, - 'first_conv': 'stem.0.conv', 'classifier': 'head.fc', - **kwargs - } - - -default_cfgs = dict( - xception41=_cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_xception_41-e6439c97.pth'), - xception65=_cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_xception_65-c9ae96e8.pth'), - xception71=_cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_xception_71-8eec7df1.pth'), -) - - -class SeparableConv2d(nn.Module): - def __init__( - self, inplanes, planes, kernel_size=3, stride=1, dilation=1, padding='', - act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d): - super(SeparableConv2d, self).__init__() - self.kernel_size = kernel_size - self.dilation = dilation - - # depthwise convolution - self.conv_dw = create_conv2d( - inplanes, inplanes, kernel_size, stride=stride, - padding=padding, dilation=dilation, depthwise=True) - self.bn_dw = norm_layer(inplanes) - if act_layer is not None: - self.act_dw = act_layer(inplace=True) - else: - self.act_dw = None - - # pointwise convolution - self.conv_pw = create_conv2d(inplanes, planes, kernel_size=1) - self.bn_pw = norm_layer(planes) - if act_layer is not None: - self.act_pw = act_layer(inplace=True) - else: - self.act_pw = None - - def forward(self, x): - x = self.conv_dw(x) - x = self.bn_dw(x) - if self.act_dw is not None: - x = self.act_dw(x) - x = self.conv_pw(x) - x = self.bn_pw(x) - if self.act_pw is not None: - x = self.act_pw(x) - return x - - -class XceptionModule(nn.Module): - def __init__( - self, in_chs, out_chs, stride=1, dilation=1, pad_type='', - start_with_relu=True, no_skip=False, act_layer=nn.ReLU, norm_layer=None): - super(XceptionModule, self).__init__() - out_chs = to_3tuple(out_chs) - self.in_channels = in_chs - self.out_channels = out_chs[-1] - self.no_skip = no_skip - if not no_skip and (self.out_channels != self.in_channels or stride != 1): - self.shortcut = ConvBnAct( - in_chs, self.out_channels, 1, stride=stride, norm_layer=norm_layer, act_layer=None) - else: - self.shortcut = None - - separable_act_layer = None if start_with_relu else act_layer - self.stack = nn.Sequential() - for i in range(3): - if start_with_relu: - self.stack.add_module(f'act{i + 1}', nn.ReLU(inplace=i > 0)) - self.stack.add_module(f'conv{i + 1}', SeparableConv2d( - in_chs, out_chs[i], 3, stride=stride if i == 2 else 1, dilation=dilation, padding=pad_type, - act_layer=separable_act_layer, norm_layer=norm_layer)) - in_chs = out_chs[i] - - def forward(self, x): - skip = x - x = self.stack(x) - if self.shortcut is not None: - skip = self.shortcut(skip) - if not self.no_skip: - x = x + skip - return x - - -class XceptionAligned(nn.Module): - """Modified Aligned Xception - """ - - def __init__(self, block_cfg, num_classes=1000, in_chans=3, output_stride=32, - act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, drop_rate=0., global_pool='avg'): - super(XceptionAligned, self).__init__() - self.num_classes = num_classes - self.drop_rate = drop_rate - assert output_stride in (8, 16, 32) - - layer_args = dict(act_layer=act_layer, norm_layer=norm_layer) - self.stem = nn.Sequential(*[ - ConvBnAct(in_chans, 32, kernel_size=3, stride=2, **layer_args), - ConvBnAct(32, 64, kernel_size=3, stride=1, **layer_args) - ]) - - curr_dilation = 1 - curr_stride = 2 - self.feature_info = [] - self.blocks = 
nn.Sequential() - for i, b in enumerate(block_cfg): - b['dilation'] = curr_dilation - if b['stride'] > 1: - self.feature_info += [dict( - num_chs=to_3tuple(b['out_chs'])[-2], reduction=curr_stride, module=f'blocks.{i}.stack.act3')] - next_stride = curr_stride * b['stride'] - if next_stride > output_stride: - curr_dilation *= b['stride'] - b['stride'] = 1 - else: - curr_stride = next_stride - self.blocks.add_module(str(i), XceptionModule(**b, **layer_args)) - self.num_features = self.blocks[-1].out_channels - - self.feature_info += [dict( - num_chs=self.num_features, reduction=curr_stride, module='blocks.' + str(len(self.blocks) - 1))] - - self.head = ClassifierHead( - in_chs=self.num_features, num_classes=num_classes, pool_type=global_pool, drop_rate=drop_rate) - - def get_classifier(self): - return self.head.fc - - def reset_classifier(self, num_classes, global_pool='avg'): - self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) - - def forward_features(self, x): - x = self.stem(x) - x = self.blocks(x) - return x - - def forward(self, x): - x = self.forward_features(x) - x = self.head(x) - return x - - -def _xception(variant, pretrained=False, **kwargs): - return build_model_with_cfg( - XceptionAligned, variant, pretrained, - default_cfg=default_cfgs[variant], - feature_cfg=dict(flatten_sequential=True, feature_cls='hook'), - **kwargs) - - -@register_model -def xception41(pretrained=False, **kwargs): - """ Modified Aligned Xception-41 - """ - block_cfg = [ - # entry flow - dict(in_chs=64, out_chs=128, stride=2), - dict(in_chs=128, out_chs=256, stride=2), - dict(in_chs=256, out_chs=728, stride=2), - # middle flow - *([dict(in_chs=728, out_chs=728, stride=1)] * 8), - # exit flow - dict(in_chs=728, out_chs=(728, 1024, 1024), stride=2), - dict(in_chs=1024, out_chs=(1536, 1536, 2048), stride=1, no_skip=True, start_with_relu=False), - ] - model_args = dict(block_cfg=block_cfg, norm_layer=partial(nn.BatchNorm2d, eps=.001, momentum=.1), **kwargs) - return _xception('xception41', pretrained=pretrained, **model_args) - - -@register_model -def xception65(pretrained=False, **kwargs): - """ Modified Aligned Xception-65 - """ - block_cfg = [ - # entry flow - dict(in_chs=64, out_chs=128, stride=2), - dict(in_chs=128, out_chs=256, stride=2), - dict(in_chs=256, out_chs=728, stride=2), - # middle flow - *([dict(in_chs=728, out_chs=728, stride=1)] * 16), - # exit flow - dict(in_chs=728, out_chs=(728, 1024, 1024), stride=2), - dict(in_chs=1024, out_chs=(1536, 1536, 2048), stride=1, no_skip=True, start_with_relu=False), - ] - model_args = dict(block_cfg=block_cfg, norm_layer=partial(nn.BatchNorm2d, eps=.001, momentum=.1), **kwargs) - return _xception('xception65', pretrained=pretrained, **model_args) - - -@register_model -def xception71(pretrained=False, **kwargs): - """ Modified Aligned Xception-71 - """ - block_cfg = [ - # entry flow - dict(in_chs=64, out_chs=128, stride=2), - dict(in_chs=128, out_chs=256, stride=1), - dict(in_chs=256, out_chs=256, stride=2), - dict(in_chs=256, out_chs=728, stride=1), - dict(in_chs=728, out_chs=728, stride=2), - # middle flow - *([dict(in_chs=728, out_chs=728, stride=1)] * 16), - # exit flow - dict(in_chs=728, out_chs=(728, 1024, 1024), stride=2), - dict(in_chs=1024, out_chs=(1536, 1536, 2048), stride=1, no_skip=True, start_with_relu=False), - ] - model_args = dict(block_cfg=block_cfg, norm_layer=partial(nn.BatchNorm2d, eps=.001, momentum=.1), **kwargs) - return _xception('xception71', pretrained=pretrained, 
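The loop that builds self.blocks above is what makes these Xceptions "aligned" for dense prediction: once the cumulative stride would exceed output_stride, a block's stride is converted into dilation for the blocks that follow it, keeping the feature map at a fixed resolution without shrinking the receptive field. The bookkeeping in isolation (a sketch; names are illustrative):

def plan_strides(block_strides, output_stride=16, stem_stride=2):
    curr_stride, curr_dilation, plan = stem_stride, 1, []
    for s in block_strides:
        dilation = curr_dilation           # this block runs at the dilation set so far
        if s > 1 and curr_stride * s > output_stride:
            curr_dilation *= s             # later blocks dilate instead of striding
            s = 1
        elif s > 1:
            curr_stride *= s
        plan.append((s, dilation))
    return plan


# xception41's block strides with the output stride capped at 16: the final
# striding block is flattened to stride 1 and the block after it runs dilated
print(plan_strides([2, 2, 2] + [1] * 8 + [2, 1], output_stride=16))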
**model_args) diff --git a/AVLFormer/src/timm/utils/__init__.py b/AVLFormer/src/timm/utils/__init__.py deleted file mode 100644 index 0f7c4b0..0000000 --- a/AVLFormer/src/timm/utils/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from .checkpoint_saver import CheckpointSaver -from .cuda import ApexScaler, NativeScaler -from .distributed import distribute_bn, reduce_tensor -from .jit import set_jit_legacy -from .log import setup_default_logging, FormatterNoInfo -from .metrics import AverageMeter, accuracy -from .misc import natural_key, add_bool_arg -from .model import unwrap_model, get_state_dict -from .model_ema import ModelEma, ModelEmaV2 -from .summary import update_summary, get_outdir diff --git a/AVLFormer/src/timm/utils/checkpoint_saver.py b/AVLFormer/src/timm/utils/checkpoint_saver.py deleted file mode 100644 index 6aad74e..0000000 --- a/AVLFormer/src/timm/utils/checkpoint_saver.py +++ /dev/null @@ -1,150 +0,0 @@ -""" Checkpoint Saver - -Track top-n training checkpoints and maintain recovery checkpoints on specified intervals. - -Hacked together by / Copyright 2020 Ross Wightman -""" - -import glob -import operator -import os -import logging - -import torch - -from .model import unwrap_model, get_state_dict - - -_logger = logging.getLogger(__name__) - - -class CheckpointSaver: - def __init__( - self, - model, - optimizer, - args=None, - model_ema=None, - amp_scaler=None, - checkpoint_prefix='checkpoint', - recovery_prefix='recovery', - checkpoint_dir='', - recovery_dir='', - decreasing=False, - max_history=10, - unwrap_fn=unwrap_model): - - # objects to save state_dicts of - self.model = model - self.optimizer = optimizer - self.args = args - self.model_ema = model_ema - self.amp_scaler = amp_scaler - - # state - self.checkpoint_files = [] # (filename, metric) tuples in order of decreasing betterness - self.best_epoch = None - self.best_metric = None - self.curr_recovery_file = '' - self.last_recovery_file = '' - - # config - self.checkpoint_dir = checkpoint_dir - self.recovery_dir = recovery_dir - self.save_prefix = checkpoint_prefix - self.recovery_prefix = recovery_prefix - self.extension = '.pth.tar' - self.decreasing = decreasing # a lower metric is better if True - self.cmp = operator.lt if decreasing else operator.gt # True if lhs better than rhs - self.max_history = max_history - self.unwrap_fn = unwrap_fn - assert self.max_history >= 1 - - def save_checkpoint(self, epoch, metric=None): - assert epoch >= 0 - tmp_save_path = os.path.join(self.checkpoint_dir, 'tmp' + self.extension) - last_save_path = os.path.join(self.checkpoint_dir, 'last' + self.extension) - self._save(tmp_save_path, epoch, metric) - if os.path.exists(last_save_path): - os.unlink(last_save_path) # required for Windows support. 
- os.rename(tmp_save_path, last_save_path) - worst_file = self.checkpoint_files[-1] if self.checkpoint_files else None - if (len(self.checkpoint_files) < self.max_history - or metric is None or self.cmp(metric, worst_file[1])): - if len(self.checkpoint_files) >= self.max_history: - self._cleanup_checkpoints(1) - filename = '-'.join([self.save_prefix, str(epoch)]) + self.extension - save_path = os.path.join(self.checkpoint_dir, filename) - os.link(last_save_path, save_path) - self.checkpoint_files.append((save_path, metric)) - self.checkpoint_files = sorted( - self.checkpoint_files, key=lambda x: x[1], - reverse=not self.decreasing) # sort in descending order if a lower metric is not better - - checkpoints_str = "Current checkpoints:\n" - for c in self.checkpoint_files: - checkpoints_str += ' {}\n'.format(c) - _logger.info(checkpoints_str) - - if metric is not None and (self.best_metric is None or self.cmp(metric, self.best_metric)): - self.best_epoch = epoch - self.best_metric = metric - best_save_path = os.path.join(self.checkpoint_dir, 'model_best' + self.extension) - if os.path.exists(best_save_path): - os.unlink(best_save_path) - os.link(last_save_path, best_save_path) - - return (None, None) if self.best_metric is None else (self.best_metric, self.best_epoch) - - def _save(self, save_path, epoch, metric=None): - save_state = { - 'epoch': epoch, - 'arch': type(self.model).__name__.lower(), - 'state_dict': get_state_dict(self.model, self.unwrap_fn), - 'optimizer': self.optimizer.state_dict(), - 'version': 2, # version < 2 increments epoch before save - } - if self.args is not None: - save_state['arch'] = self.args.model - save_state['args'] = self.args - if self.amp_scaler is not None: - save_state[self.amp_scaler.state_dict_key] = self.amp_scaler.state_dict() - if self.model_ema is not None: - save_state['state_dict_ema'] = get_state_dict(self.model_ema, self.unwrap_fn) - if metric is not None: - save_state['metric'] = metric - torch.save(save_state, save_path) - - def _cleanup_checkpoints(self, trim=0): - trim = min(len(self.checkpoint_files), trim) - delete_index = self.max_history - trim - if delete_index < 0 or len(self.checkpoint_files) <= delete_index: - return - to_delete = self.checkpoint_files[delete_index:] - for d in to_delete: - try: - _logger.debug("Cleaning checkpoint: {}".format(d)) - os.remove(d[0]) - except Exception as e: - _logger.error("Exception '{}' while deleting checkpoint".format(e)) - self.checkpoint_files = self.checkpoint_files[:delete_index] - - def save_recovery(self, epoch, batch_idx=0): - assert epoch >= 0 - filename = '-'.join([self.recovery_prefix, str(epoch), str(batch_idx)]) + self.extension - save_path = os.path.join(self.recovery_dir, filename) - self._save(save_path, epoch) - if os.path.exists(self.last_recovery_file): - try: - _logger.debug("Cleaning recovery: {}".format(self.last_recovery_file)) - os.remove(self.last_recovery_file) - except Exception as e: - _logger.error("Exception '{}' while removing {}".format(e, self.last_recovery_file)) - self.last_recovery_file = self.curr_recovery_file - self.curr_recovery_file = save_path - - def find_recovery(self): - recovery_path = os.path.join(self.recovery_dir, self.recovery_prefix) - files = glob.glob(recovery_path + '*' + self.extension) - files = sorted(files) - return files[0] if len(files) else '' diff --git a/AVLFormer/src/timm/utils/cuda.py b/AVLFormer/src/timm/utils/cuda.py deleted file mode 100644 index bcd29f5..0000000 --- a/AVLFormer/src/timm/utils/cuda.py +++ /dev/null @@ -1,53 +0,0 @@ 
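save_checkpoint above maintains a fixed-size, metric-sorted list of the best checkpoints: a candidate is admitted when there is room or when it beats the current worst entry, which is then evicted. That bookkeeping reduced to its core (a sketch, not the deleted class):

import operator


class TopKCheckpoints:
    def __init__(self, max_history=3, decreasing=False):
        self.files = []  # (path, metric) tuples, best first
        self.max_history = max_history
        self.decreasing = decreasing
        self.cmp = operator.lt if decreasing else operator.gt

    def offer(self, path, metric):
        if len(self.files) < self.max_history or self.cmp(metric, self.files[-1][1]):
            if len(self.files) >= self.max_history:
                self.files.pop()  # evict the current worst
            self.files.append((path, metric))
            self.files.sort(key=lambda f: f[1], reverse=not self.decreasing)


saver = TopKCheckpoints(max_history=3)
for epoch, acc in enumerate([61.0, 70.2, 66.5, 72.1, 59.9]):
    saver.offer(f'checkpoint-{epoch}.pth.tar', acc)
print(saver.files)  # the three best epochs by accuracy: 3, then 1, then 2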
-""" CUDA / AMP utils - -Hacked together by / Copyright 2020 Ross Wightman -""" -import torch - -try: - from apex import amp - has_apex = True -except ImportError: - amp = None - has_apex = False - - -class ApexScaler: - state_dict_key = "amp" - - def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False): - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward(create_graph=create_graph) - if clip_grad is not None: - torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), clip_grad) - optimizer.step() - - def state_dict(self): - if 'state_dict' in amp.__dict__: - return amp.state_dict() - - def load_state_dict(self, state_dict): - if 'load_state_dict' in amp.__dict__: - amp.load_state_dict(state_dict) - - -class NativeScaler: - state_dict_key = "amp_scaler" - - def __init__(self): - self._scaler = torch.cuda.amp.GradScaler() - - def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False): - self._scaler.scale(loss).backward(create_graph=create_graph) - if clip_grad is not None: - assert parameters is not None - self._scaler.unscale_(optimizer) # unscale the gradients of optimizer's assigned params in-place - torch.nn.utils.clip_grad_norm_(parameters, clip_grad) - self._scaler.step(optimizer) - self._scaler.update() - - def state_dict(self): - return self._scaler.state_dict() - - def load_state_dict(self, state_dict): - self._scaler.load_state_dict(state_dict) diff --git a/AVLFormer/src/timm/utils/distributed.py b/AVLFormer/src/timm/utils/distributed.py deleted file mode 100644 index 3c5dba8..0000000 --- a/AVLFormer/src/timm/utils/distributed.py +++ /dev/null @@ -1,28 +0,0 @@ -""" Distributed training/validation utils - -Hacked together by / Copyright 2020 Ross Wightman -""" -import torch -from torch import distributed as dist - -from .model import unwrap_model - - -def reduce_tensor(tensor, n): - rt = tensor.clone() - dist.all_reduce(rt, op=dist.ReduceOp.SUM) - rt /= n - return rt - - -def distribute_bn(model, world_size, reduce=False): - # ensure every node has the same running bn stats - for bn_name, bn_buf in unwrap_model(model).named_buffers(recurse=True): - if ('running_mean' in bn_name) or ('running_var' in bn_name): - if reduce: - # average bn stats across whole group - torch.distributed.all_reduce(bn_buf, op=dist.ReduceOp.SUM) - bn_buf /= float(world_size) - else: - # broadcast bn stats from rank 0 to whole group - torch.distributed.broadcast(bn_buf, 0) diff --git a/AVLFormer/src/timm/utils/jit.py b/AVLFormer/src/timm/utils/jit.py deleted file mode 100644 index 185ab7a..0000000 --- a/AVLFormer/src/timm/utils/jit.py +++ /dev/null @@ -1,18 +0,0 @@ -""" JIT scripting/tracing utils - -Hacked together by / Copyright 2020 Ross Wightman -""" -import torch - - -def set_jit_legacy(): - """ Set JIT executor to legacy w/ support for op fusion - This is hopefully a temporary need in 1.5/1.5.1/1.6 to restore performance due to changes - in the JIT exectutor. These API are not supported so could change. - """ - # - assert hasattr(torch._C, '_jit_set_profiling_executor'), "Old JIT behavior doesn't exist!" 
- torch._C._jit_set_profiling_executor(False) - torch._C._jit_set_profiling_mode(False) - torch._C._jit_override_can_fuse_on_gpu(True) - #torch._C._jit_set_texpr_fuser_enabled(True) diff --git a/AVLFormer/src/timm/utils/log.py b/AVLFormer/src/timm/utils/log.py deleted file mode 100644 index c99469e..0000000 --- a/AVLFormer/src/timm/utils/log.py +++ /dev/null @@ -1,28 +0,0 @@ -""" Logging helpers - -Hacked together by / Copyright 2020 Ross Wightman -""" -import logging -import logging.handlers - - -class FormatterNoInfo(logging.Formatter): - def __init__(self, fmt='%(levelname)s: %(message)s'): - logging.Formatter.__init__(self, fmt) - - def format(self, record): - if record.levelno == logging.INFO: - return str(record.getMessage()) - return logging.Formatter.format(self, record) - - -def setup_default_logging(default_level=logging.INFO, log_path=''): - console_handler = logging.StreamHandler() - console_handler.setFormatter(FormatterNoInfo()) - logging.root.addHandler(console_handler) - logging.root.setLevel(default_level) - if log_path: - file_handler = logging.handlers.RotatingFileHandler(log_path, maxBytes=(1024 ** 2 * 2), backupCount=3) - file_formatter = logging.Formatter("%(asctime)s - %(name)20s: [%(levelname)8s] - %(message)s") - file_handler.setFormatter(file_formatter) - logging.root.addHandler(file_handler) diff --git a/AVLFormer/src/timm/utils/metrics.py b/AVLFormer/src/timm/utils/metrics.py deleted file mode 100644 index 8e0b1f9..0000000 --- a/AVLFormer/src/timm/utils/metrics.py +++ /dev/null @@ -1,32 +0,0 @@ -""" Eval metrics and related - -Hacked together by / Copyright 2020 Ross Wightman -""" - - -class AverageMeter: - """Computes and stores the average and current value""" - def __init__(self): - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - -def accuracy(output, target, topk=(1,)): - """Computes the accuracy over the k top predictions for the specified values of k""" - maxk = max(topk) - batch_size = target.size(0) - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.reshape(1, -1).expand_as(pred)) - return [correct[:k].reshape(-1).float().sum(0) * 100. 
/ batch_size for k in topk]
diff --git a/AVLFormer/src/timm/utils/misc.py b/AVLFormer/src/timm/utils/misc.py
deleted file mode 100644
index 39c0097..0000000
--- a/AVLFormer/src/timm/utils/misc.py
+++ /dev/null
@@ -1,18 +0,0 @@
-""" Misc utils
-
-Hacked together by / Copyright 2020 Ross Wightman
-"""
-import re
-
-
-def natural_key(string_):
-    """See http://www.codinghorror.com/blog/archives/001018.html"""
-    return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_.lower())]
-
-
-def add_bool_arg(parser, name, default=False, help=''):
-    dest_name = name.replace('-', '_')
-    group = parser.add_mutually_exclusive_group(required=False)
-    group.add_argument('--' + name, dest=dest_name, action='store_true', help=help)
-    group.add_argument('--no-' + name, dest=dest_name, action='store_false', help=help)
-    parser.set_defaults(**{dest_name: default})
diff --git a/AVLFormer/src/timm/utils/model.py b/AVLFormer/src/timm/utils/model.py
deleted file mode 100644
index cfd4280..0000000
--- a/AVLFormer/src/timm/utils/model.py
+++ /dev/null
@@ -1,16 +0,0 @@
-""" Model / state_dict utils
-
-Hacked together by / Copyright 2020 Ross Wightman
-"""
-from .model_ema import ModelEma
-
-
-def unwrap_model(model):
-    if isinstance(model, ModelEma):
-        return unwrap_model(model.ema)
-    else:
-        return model.module if hasattr(model, 'module') else model
-
-
-def get_state_dict(model, unwrap_fn=unwrap_model):
-    return unwrap_fn(model).state_dict()
diff --git a/AVLFormer/src/timm/utils/model_ema.py b/AVLFormer/src/timm/utils/model_ema.py
deleted file mode 100644
index 073d5c5..0000000
--- a/AVLFormer/src/timm/utils/model_ema.py
+++ /dev/null
@@ -1,126 +0,0 @@
-""" Exponential Moving Average (EMA) of model updates
-
-Hacked together by / Copyright 2020 Ross Wightman
-"""
-import logging
-from collections import OrderedDict
-from copy import deepcopy
-
-import torch
-import torch.nn as nn
-
-_logger = logging.getLogger(__name__)
-
-
-class ModelEma:
-    """ Model Exponential Moving Average (DEPRECATED)
-
-    Keep a moving average of everything in the model state_dict (parameters and buffers).
-    This version is deprecated; it does not work with scripted models. It will be removed eventually.
-
-    This is intended to allow functionality like
-    https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
-
-    A smoothed version of the weights is necessary for some training schemes to perform well.
-    E.g. Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc. that use
-    RMSprop with a short 2.4-3 epoch decay period and slow LR decay rate of .96-.99 require EMA
-    smoothing of weights to match results. Pay attention to the decay constant you are using
-    relative to your update count per epoch.
-
-    To keep EMA from using GPU resources, set device='cpu'. This will save a bit of memory but
-    disable validation of the EMA weights. Validation will have to be done manually in a separate
-    process, or after the training stops converging.
-
-    This class is sensitive to where it is initialized in the sequence of model init,
-    GPU assignment and distributed training wrappers.
- """ - def __init__(self, model, decay=0.9999, device='', resume=''): - # make a copy of the model for accumulating moving average of weights - self.ema = deepcopy(model) - self.ema.eval() - self.decay = decay - self.device = device # perform ema on different device from model if set - if device: - self.ema.to(device=device) - self.ema_has_module = hasattr(self.ema, 'module') - if resume: - self._load_checkpoint(resume) - for p in self.ema.parameters(): - p.requires_grad_(False) - - def _load_checkpoint(self, checkpoint_path): - checkpoint = torch.load(checkpoint_path, map_location='cpu') - assert isinstance(checkpoint, dict) - if 'state_dict_ema' in checkpoint: - new_state_dict = OrderedDict() - for k, v in checkpoint['state_dict_ema'].items(): - # ema model may have been wrapped by DataParallel, and need module prefix - if self.ema_has_module: - name = 'module.' + k if not k.startswith('module') else k - else: - name = k - new_state_dict[name] = v - self.ema.load_state_dict(new_state_dict) - _logger.info("Loaded state_dict_ema") - else: - _logger.warning("Failed to find state_dict_ema, starting from loaded model weights") - - def update(self, model): - # correct a mismatch in state dict keys - needs_module = hasattr(model, 'module') and not self.ema_has_module - with torch.no_grad(): - msd = model.state_dict() - for k, ema_v in self.ema.state_dict().items(): - if needs_module: - k = 'module.' + k - model_v = msd[k].detach() - if self.device: - model_v = model_v.to(device=self.device) - ema_v.copy_(ema_v * self.decay + (1. - self.decay) * model_v) - - -class ModelEmaV2(nn.Module): - """ Model Exponential Moving Average V2 - - Keep a moving average of everything in the model state_dict (parameters and buffers). - V2 of this module is simpler, it does not match params/buffers based on name but simply - iterates in order. It works with torchscript (JIT of full model). - - This is intended to allow functionality like - https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage - - A smoothed version of the weights is necessary for some training schemes to perform well. - E.g. Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc that use - RMSprop with a short 2.4-3 epoch decay period and slow LR decay rate of .96-.99 requires EMA - smoothing of weights to match results. Pay attention to the decay constant you are using - relative to your update count per epoch. - - To keep EMA from using GPU resources, set device='cpu'. This will save a bit of memory but - disable validation of the EMA weights. Validation will have to be done manually in a separate - process, or after the training stops converging. - - This class is sensitive where it is initialized in the sequence of model init, - GPU assignment and distributed training wrappers. 
- """ - def __init__(self, model, decay=0.9999, device=None): - super(ModelEmaV2, self).__init__() - # make a copy of the model for accumulating moving average of weights - self.module = deepcopy(model) - self.module.eval() - self.decay = decay - self.device = device # perform ema on different device from model if set - if self.device is not None: - self.module.to(device=device) - - def _update(self, model, update_fn): - with torch.no_grad(): - for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): - if self.device is not None: - model_v = model_v.to(device=self.device) - ema_v.copy_(update_fn(ema_v, model_v)) - - def update(self, model): - self._update(model, update_fn=lambda e, m: self.decay * e + (1. - self.decay) * m) - - def set(self, model): - self._update(model, update_fn=lambda e, m: m) diff --git a/AVLFormer/src/timm/utils/summary.py b/AVLFormer/src/timm/utils/summary.py deleted file mode 100644 index a0801ea..0000000 --- a/AVLFormer/src/timm/utils/summary.py +++ /dev/null @@ -1,34 +0,0 @@ -""" Summary utilities - -Hacked together by / Copyright 2020 Ross Wightman -""" -import csv -import os -from collections import OrderedDict - - -def get_outdir(path, *paths, inc=False): - outdir = os.path.join(path, *paths) - if not os.path.exists(outdir): - os.makedirs(outdir) - elif inc: - count = 1 - outdir_inc = outdir + '-' + str(count) - while os.path.exists(outdir_inc): - count = count + 1 - outdir_inc = outdir + '-' + str(count) - assert count < 100 - outdir = outdir_inc - os.makedirs(outdir) - return outdir - - -def update_summary(epoch, train_metrics, eval_metrics, filename, write_header=False): - rowd = OrderedDict(epoch=epoch) - rowd.update([('train_' + k, v) for k, v in train_metrics.items()]) - rowd.update([('eval_' + k, v) for k, v in eval_metrics.items()]) - with open(filename, mode='a') as cf: - dw = csv.DictWriter(cf, fieldnames=rowd.keys()) - if write_header: # first iteration (epoch == 1 can't be used) - dw.writeheader() - dw.writerow(rowd) diff --git a/AVLFormer/src/timm/version.py b/AVLFormer/src/timm/version.py deleted file mode 100644 index f0ede3d..0000000 --- a/AVLFormer/src/timm/version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = '0.4.1' diff --git a/AVLFormer/src/utils/__init__.py b/AVLFormer/src/utils/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/AVLFormer/src/utils/basic_utils.py b/AVLFormer/src/utils/basic_utils.py deleted file mode 100644 index 34c1c2d..0000000 --- a/AVLFormer/src/utils/basic_utils.py +++ /dev/null @@ -1,206 +0,0 @@ -import json -import os -import pickle -import zipfile - -import numpy as np - - -def load_pickle(filename): - with open(filename, "rb") as f: - return pickle.load(f) - - -def save_pickle(data, filename): - with open(filename, "wb") as f: - pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL) - - -def is_jsonable(x): - try: - json.dumps(x) - return True - except: - return False - - -def load_json(filename): - with open(filename, "r") as f: - return json.load(f) - - -def save_json(data, filename, save_pretty=False, sort_keys=False): - with open(filename, "w") as f: - if save_pretty: - f.write(json.dumps(data, indent=4, sort_keys=sort_keys)) - else: - json.dump(data, f) - - -def load_jsonl(filename): - with open(filename, "r") as f: - return [json.loads(l.strip("\n")) for l in f.readlines()] - - -def save_jsonl(data, filename): - """data is a list""" - with open(filename, "w") as f: - f.write("\n".join([json.dumps(e) for e in data])) - - -def concat_json_list(filepaths, 
save_path):
-    json_lists = []
-    for p in filepaths:
-        json_lists += load_json(p)
-    save_json(json_lists, save_path)
-
-
-def save_lines(list_of_str, filepath):
-    with open(filepath, "w") as f:
-        f.write("\n".join(list_of_str))
-
-
-def read_lines(filepath):
-    with open(filepath, "r") as f:
-        return [e.strip("\n") for e in f.readlines()]
-
-
-def mkdirp(p):
-    if not os.path.exists(p):
-        os.makedirs(p)
-
-
-def flat_list_of_lists(l):
-    """flatten a list of lists [[1,2], [3,4]] to [1,2,3,4]"""
-    return [item for sublist in l for item in sublist]
-
-
-def convert_to_seconds(hms_time):
-    """ convert '00:01:12' to 72 seconds.
-    :hms_time (str): time as a colon-separated string, e.g. '00:01:12'
-    :return (int): time in seconds, e.g. 72
-    """
-    times = [float(t) for t in hms_time.split(":")]
-    return times[0] * 3600 + times[1] * 60 + times[2]
-
-
-def get_video_name_from_url(url):
-    return url.split("/")[-1][:-4]
-
-
-def merge_dicts(list_dicts):
-    merged_dict = list_dicts[0].copy()
-    for i in range(1, len(list_dicts)):
-        merged_dict.update(list_dicts[i])
-    return merged_dict
-
-
-def l2_normalize_np_array(np_array, eps=1e-5):
-    """np_array: np.ndarray, (*, D), where the last dim will be normalized"""
-    return np_array / (np.linalg.norm(np_array, axis=-1, keepdims=True) + eps)
-
-
-def make_zipfile(src_dir,
-                 save_path,
-                 enclosing_dir="",
-                 exclude_dirs=None,
-                 exclude_extensions=None,
-                 exclude_dirs_substring=None):
-    """make a zip file of src_dir and save it to save_path.
-    Dirs in exclude_dirs are excluded if they are subdirs of src_dir.
-    An enclosing_dir is added if specified.
-    """
-    abs_src = os.path.abspath(src_dir)
-    with zipfile.ZipFile(save_path, "w") as zf:
-        for dirname, subdirs, files in os.walk(src_dir):
-            if exclude_dirs is not None:
-                for e_p in exclude_dirs:
-                    if e_p in subdirs:
-                        subdirs.remove(e_p)
-            if exclude_dirs_substring is not None:
-                to_rm = []
-                for d in subdirs:
-                    if exclude_dirs_substring in d:
-                        to_rm.append(d)
-                for e in to_rm:
-                    subdirs.remove(e)
-            arcname = os.path.join(enclosing_dir, dirname[len(abs_src) + 1:])
-            zf.write(dirname, arcname)
-            for filename in files:
-                if exclude_extensions is not None:
-                    if os.path.splitext(filename)[1] in exclude_extensions:
-                        continue  # do not zip it
-                absname = os.path.join(dirname, filename)
-                arcname = os.path.join(enclosing_dir,
-                                       absname[len(abs_src) + 1:])
-                zf.write(absname, arcname)
-
-
-class AverageMeter(object):
-    """Computes and stores the average and current/max/min value"""
-
-    def __init__(self):
-        self.val = 0
-        self.avg = 0
-        self.sum = 0
-        self.count = 0
-        self.max = -1e10
-        self.min = 1e10
-        self.reset()
-
-    def reset(self):
-        self.val = 0
-        self.avg = 0
-        self.sum = 0
-        self.count = 0
-        self.max = -1e10
-        self.min = 1e10
-
-    def update(self, val, n=1):
-        self.max = max(val, self.max)
-        self.min = min(val, self.min)
-        self.val = val
-        self.sum += val * n
-        self.count += n
-        self.avg = self.sum / self.count
-
-
-def dissect_by_lengths(np_array, lengths, dim=0, assert_equal=True):
-    """Dissect an array (N, D) into a list of sub-arrays.
-    np_array.shape[0] == sum(lengths); the output is a list of nd arrays, the singleton dimension is kept"""
-    if assert_equal:
-        assert len(np_array) == sum(lengths)
-    length_indices = [
-        0,
-    ]
-    for i in range(len(lengths)):
-        length_indices.append(length_indices[i] + lengths[i])
-    if dim == 0:
-        array_list = [
-            np_array[length_indices[i]:length_indices[i + 1]]
-            for i in range(len(lengths))
-        ]
-    elif dim == 1:
-        array_list = [
-            np_array[:, length_indices[i]:length_indices[i + 1]]
-            for i in
range(len(lengths)) - ] - elif dim == 2: - array_list = [ - np_array[:, :, length_indices[i]:length_indices[i + 1]] - for i in range(len(lengths)) - ] - else: - raise NotImplementedError - return array_list - - -def get_ratio_from_counter(counter_obj, threshold=200): - keys = counter_obj.keys() - values = counter_obj.values() - filtered_values = [counter_obj[k] for k in keys if k > threshold] - return float(sum(filtered_values)) / sum(values) - - -def get_rounded_percentage(float_number, n_floats=2): - return round(float_number * 100, n_floats) diff --git a/AVLFormer/src/utils/cloud_storage.py b/AVLFormer/src/utils/cloud_storage.py deleted file mode 100644 index 52e2844..0000000 --- a/AVLFormer/src/utils/cloud_storage.py +++ /dev/null @@ -1,915 +0,0 @@ -import glob -import logging -import os -import os.path as op -from pprint import pformat -import sys -import time - -from deprecated import deprecated -from tqdm import tqdm - -from .qd_common import ( - acquireLock, - cmd_run, - ensure_directory, - ensure_remove_file, - get_file_size, - has_handle, - hash_sha1, - load_from_yaml_file, - parse_iteration, - query_all_opened_file_in_system, - releaseLock, - write_to_file, -) - -try: - from azure.storage.common.storageclient import logger - logger.propagate = False -except Exception as e: - print(f"Cannot import azure.storage {e}") - - -def create_cloud_storage(x=None, config_file=None, config=None): - if config is not None: - return CloudStorage(config) - if config_file is None: - config_file = './aux_data/configs/{}blob_account.yaml'.format(x) - config = load_from_yaml_file(config_file) - c = CloudStorage(config) - return c - - -def azcopy_upload(src, dest_url, dest_key): - cmd = [ - get_azcopy(), '--source', src, '--destination', dest_url, - '--exclude-older', '--dest-key', "{}".format(dest_key), '--quiet', - '--parallel-level', '32' - ] - resume_file = '/tmp/azure.' 
+ hash_sha1([src, dest_url]) + '.jnl' - cmd.append('--resume') - cmd.append(resume_file) - if op.isdir(src): - cmd.append('--recursive') - cmd_run(cmd, shell=True) - - -def blob_upload_qdoutput(src_path, dest_path, client): - to_copy = get_to_copy_file_for_qdoutput(src_path, dest_path) - for f, d in to_copy: - client.az_upload2(f, d) - - -def get_to_copy_file_for_qdoutput(src_path, dest_path): - # upload all the files under src_path - to_copy = [] - all_src_file = glob.glob(op.join(src_path, '*')) - exclude_suffix = ['txt', 'zip'] - for f in all_src_file: - if op.isfile(f) and not any(f.endswith(s) for s in exclude_suffix): - to_copy.append((f, op.join(dest_path, op.basename(f)))) - - # for the model and the tested files, only upload the best - all_src_file = glob.glob( - op.join(src_path, 'snapshot', 'model_iter_*.caffemodel')) - all_src_file.extend( - glob.glob(op.join(src_path, 'snapshot', 'model_iter_*.pt'))) - all_iter = [parse_iteration(f) for f in all_src_file] - max_iters = max(all_iter) - need_copy_files = [ - f for f, i in zip(all_src_file, all_iter) if i == max_iters - ] - dest_snapshot = op.join(dest_path, 'snapshot') - for f in need_copy_files: - to_copy.append((f, op.join(dest_snapshot, op.basename(f)))) - if f.endswith('.caffemodel'): - f = f.replace('.caffemodel', '.solverstate') - to_copy.append((f, op.join(dest_snapshot, op.basename(f)))) - return to_copy - - -def blob_upload(src, dst, c=None): - if c is None: - c = create_cloud_storage('vig') - c.az_upload2(src, dst) - - -def get_root_all_full_expid(full_expid_prefix, all_blob_name): - # full_expid_prefix can be the folder of full_expid; or with some prefix so - # that we can filter - if all(b.startswith(full_expid_prefix + '/') for b in all_blob_name): - root = full_expid_prefix - else: - root = op.dirname(full_expid_prefix) - all_full_expid = set(b[len(root) + 1:].split('/')[0] for b in all_blob_name - if b.startswith(root)) - return root, all_full_expid - - -def get_azcopy(): - # this is v10 - azcopy = op.expanduser('~/code/azcopy/azcopy') - if not op.isfile(azcopy): - azcopy = 'azcopy' - return azcopy - - -def get_leaf_names(all_fname): - # build the tree first - from ete3 import Tree - root = Tree() - for fname in all_fname: - components = fname.split('/') - curr = root - for com in components: - currs = [c for c in curr.children if c.name == com] - if len(currs) == 0: - curr = curr.add_child(name=com) - else: - assert len(currs) == 1 - curr = currs[0] - result = [] - for node in root.iter_leaves(): - ans = [s.name for s in node.get_ancestors()[:-1]] - ans.insert(0, node.name) - result.append('/'.join([a for a in ans[::-1]])) - return result - - -def blob_download_qdoutput(src_path, target_folder): - c = create_cloud_storage('vig') - c.blob_download_qdoutput(src_path, target_folder) - - -def create_cloud_fuse(config=None): - if config is None: - if 'QD_CLOUD_FUSE_CONFIG_FILE' in os.environ: - fname = os.environ['QD_CLOUD_FUSE_CONFIG_FILE'] - else: - fname = 'aux_data/configs/cloud_fuse.yaml' - config = load_from_yaml_file(fname) - local_to_remote_cache = config['local_to_remote_cache'] - cloud_account = config['storage_account'] - cloud = create_cloud_storage(cloud_account) - return CloudFuse(local_to_remote_cache, cloud) - - -def garbage_collection_for_cloud_fuse_loop(local_folders, total_size_limit): - while True: - try: - garbage_collection_for_cloud_fuse(local_folders, total_size_limit) - except: - from .qd_common import print_trace - print_trace() - time.sleep(10) - - -def 
garbage_collection_for_cloud_fuse(local_folders, total_size_limit):
-    infos = []
-    for local_folder in local_folders:
-        for root, _, files in os.walk(local_folder):
-            for f in files:
-                if f.startswith('.') or f.endswith('.tmp'):
-                    continue
-                f = op.join(root, f)
-                if op.isfile(f):
-                    info = {
-                        'fname': f,
-                        'size_in_bytes': get_file_size(f),
-                    }
-                    infos.append(info)
-    total = sum([i['size_in_bytes'] for i in infos])
-    #logging.info('scanned {} files'.format(len(infos)))
-    if total < total_size_limit:
-        #logging.info('{}G < {}G. No need to delete files'.format(
-        #total / 1024 ** 3, total_size_limit / 1024 ** 3))
-        return
-    # we need to delete
-    need_to_delete = total - total_size_limit
-    opened = set(query_all_opened_file_in_system())
-    logging.info('queried {} opened files in system'.format(len(opened)))
-    for i in infos:
-        i['has_handle'] = has_handle(i['fname'], opened)
-        i['last_access_time'] = op.getatime(i['fname'])
-    infos = sorted(infos,
-                   key=lambda i: (i['has_handle'], i['last_access_time']))
-    deleted = []
-    for i in infos:
-        ensure_remove_file(i['fname'])
-        need_to_delete -= i['size_in_bytes']
-        deleted.append(i['fname'])
-        if need_to_delete < 0:
-            break
-    logging.info('deleted {}'.format(deleted))
-
-
-class CloudFuse(object):
-
-    def __init__(self, local_to_remote_cache, cloud):
-        self.local_to_remote_cache = local_to_remote_cache
-        self.local_to_remote_cache = dict([
-            (op.abspath(l), x) for l, x in self.local_to_remote_cache.items()
-        ])
-        self.cloud = cloud
-        self.invoked_collector = False
-
-    def isfile(self, fname):
-        remote_dir, cache_dir, sub_name = self.get_remote_cache(fname)
-        # check the lookup result before joining: get_remote_cache returns
-        # (None, None, None) for paths outside any fused folder, and
-        # op.join(None, sub_name) would raise a TypeError
-        if remote_dir is None:
-            return op.isfile(fname)
-        remote_file = op.join(remote_dir, sub_name)
-        return self.cloud.file_exists(remote_file)
-
-    def get_file_size(self, fname):
-        remote_dir, cache_dir, sub_name = self.get_remote_cache(fname)
-        if sub_name is None:
-            return get_file_size(fname)
-        remote_file = op.join(remote_dir, sub_name)
-        return self.cloud.query_info(remote_file)['size_in_bytes']
-
-    def ensure_cache(self, fname_or_fs):
-        if isinstance(fname_or_fs, str):
-            fname = fname_or_fs
-            remote_dir, cache_dir, sub_name = self.get_remote_cache(fname)
-            # as in isfile(), check for a lookup miss before building paths
-            if remote_dir is None:
-                return
-            remote_file = op.join(remote_dir, sub_name)
-            cache_file = op.join(cache_dir, sub_name)
-            self.ensure_remote_to_cache(remote_file, cache_file)
-        elif isinstance(fname_or_fs, (tuple, list)) and len(fname_or_fs) > 0:
-            # for multiple files, we can download them at the same time rather
-            # than one by one
-            fnames = fname_or_fs
-            remote_cache_infos = [self.get_remote_cache(f) for f in fnames]
-            remote_cache_infos = [(rd, cd, sn)
-                                  for rd, cd, sn in remote_cache_infos
-                                  if not op.isfile(op.join(cd, sn))]
-            if len(remote_cache_infos) > 0:
-                lock_fd = self.acquireLock()
-                # we need to check again
-                remote_cache_infos = [self.get_remote_cache(f) for f in fnames]
-                remote_cache_infos = [(rd, cd, sn)
-                                      for rd, cd, sn in remote_cache_infos
-                                      if not op.isfile(op.join(cd, sn))]
-                remote_cache_infos = [((rd, cd), sn)
-                                      for rd, cd, sn in remote_cache_infos]
-                from .qd_common import list_to_dict
-                rd_cd_to_sn = list_to_dict(remote_cache_infos, 0)
-                for (rd, cd), sns in rd_cd_to_sn.items():
-                    file_list = '/tmp/{}'.format(hash_sha1(pformat(sns)))
-                    write_to_file('\n'.join(sns), file_list)
-                    self.cloud.az_download(
-                        rd,
-                        cd,
-                        is_folder=False,
-                        file_list=file_list,
-                        tmp_first=False,
-                        sync=False,
-                        retry=10,
-                    )
-                    for s in sns:
-                        assert op.isfile(op.join(cd, s)), op.join(cd, s)
-                releaseLock(lock_fd)
-
-    def 
open(self, fname, mode): - assert mode in ['r', 'rb'], 'no support for writing' - remote_dir, cache_dir, sub_name = self.get_remote_cache(fname) - if remote_dir is None: - return open(fname, mode) - remote_file = op.join(remote_dir, sub_name) - cache_file = op.join(cache_dir, sub_name) - self.ensure_remote_to_cache(remote_file, cache_file) - return open(cache_file, mode) - - def ensure_del_cache(self, fname): - remote_dir, cache_dir, sub_name = self.get_remote_cache(fname) - if remote_dir is None: - return - cache_file = op.join(cache_dir, sub_name) - #if has_handle(cache_file): - #logging.warning('{} is used by some process'.format( - #cache_file)) - #return False - ensure_remove_file(cache_file) - #return True - - def ensure_invoke_garbage_collect(self): - # if the current occupied files take more than 0.5. Then, remove the - # oldest files - if self.invoked_collector: - return - self.invoked_collector = True - local_caches = [c for r, c in self.local_to_remote_cache.values()] - local_caches = list(set(local_caches)) - import threading - - # 200 GB - limit = 200 * 1024 * 1024 * 1024 - t = threading.Thread( - target=garbage_collection_for_cloud_fuse_loop, - args=(local_caches, limit), - daemon=True, - ) - t.start() - self.garbage_collector = t - - #-------------------------------------------------------------- - def acquireLock(self, x=None): - # all files share one lock - lock_fd = acquireLock(op.join('/tmp', 'lock_azcopy_fuse')) - return lock_fd - - def get_remote_cache(self, fname): - fname = op.abspath(fname) - local_folders = [ - l for l in self.local_to_remote_cache if fname.startswith(l + '/') - ] - if len(local_folders) == 0: - return None, None, None - local_folder = local_folders[0] - remote_folder, cache_folder = self.local_to_remote_cache[local_folder] - sub_name = fname[len(local_folder) + 1:] - d = op.dirname(sub_name) - sub = op.basename(sub_name) - return op.join(remote_folder, d), op.join(cache_folder, d), sub - - def remote_to_cache(self, remote_file, cache_file): - if op.isfile(cache_file): - return - lock_fd = self.acquireLock() - if not op.isfile(cache_file): - self.cloud.az_download(remote_file, - cache_file, - sync=False, - is_folder=False) - # the following is slower - #self.cloud.download_to_path( - #remote_file, cache_file, max_connections=10) - releaseLock(lock_fd) - - def ensure_remote_to_cache(self, remote_file, cache_file): - if not op.isfile(cache_file): - self.remote_to_cache(remote_file, cache_file) - - -class CloudStorage(object): - - def __init__(self, config=None): - if config is None: - config_file = 'aux_data/configs/azure_blob_account.yaml' - config = load_from_yaml_file(config_file) - account_name = config['account_name'] - account_key = config.get('account_key') - self.sas_token = config.get('sas_token') - self.container_name = config['container_name'] - self.account_name = account_name - self.account_key = account_key - self._block_blob_service = None - - def __repr__(self): - return 'CloudStorage(account={}, container={})'.format( - self.account_name, - self.container_name, - ) - - @property - def block_blob_service(self): - if self._block_blob_service is None: - from azure.storage.blob import BlockBlobService - self._block_blob_service = BlockBlobService( - account_name=self.account_name, - account_key=self.account_key, - sas_token=self.sas_token) - return self._block_blob_service - - def list_blob_names(self, prefix=None, creation_time_larger_than=None): - if creation_time_larger_than is not None: - creation_time_larger_than = 
creation_time_larger_than.timestamp()
-            return (b.name for b in self.block_blob_service.list_blobs(
-                self.container_name, prefix=prefix)
-                    if b.properties.creation_time.timestamp() >
-                    creation_time_larger_than)
-        else:
-            return self.block_blob_service.list_blob_names(self.container_name,
-                                                           prefix=prefix)
-
-    def rm_prefix(self, prefix):
-        all_path = self.list_blob_names(prefix)
-        for p in all_path:
-            logging.info('deleting {}'.format(p))
-            self.rm(p)
-
-    def rm(self, path):
-        self.block_blob_service.delete_blob(self.container_name, path)
-
-    def iter_blob_info(self, prefix=None, creation_time_larger_than=None):
-
-        def valid(b):
-            c1 = creation_time_larger_than is None or b.properties.creation_time.timestamp(
-            ) > creation_time_larger_than.timestamp()
-            c2 = b.name.startswith(prefix)
-            return c1 and c2
-
-        for b in self.block_blob_service.list_blobs(self.container_name,
-                                                    prefix=prefix):
-            if valid(b):
-                yield {
-                    'name': b.name,
-                    'size_in_bytes': b.properties.content_length,
-                    'creation_time': b.properties.creation_time,
-                }
-
-    def list_blob_info(self, prefix=None, creation_time_larger_than=None):
-        return list(self.iter_blob_info(prefix, creation_time_larger_than))
-
-    def get_url(self, blob_name):
-        from azure.storage.blob.models import BlobPermissions
-        permission = BlobPermissions(read=True)
-        import datetime
-        expiry = datetime.datetime.now() + datetime.timedelta(days=30)
-        sas = self.block_blob_service.generate_blob_shared_access_signature(
-            self.container_name,
-            blob_name,
-            permission=permission,
-            expiry=expiry,
-        )
-        url = self.block_blob_service.make_blob_url(self.container_name,
-                                                    blob_name)
-        return '{}?{}'.format(url, sas)
-
-    def upload_stream(self, s, name, force=False):
-        if not force and self.block_blob_service.exists(
-                self.container_name, name):
-            return self.block_blob_service.make_blob_url(
-                self.container_name, name)
-        else:
-            if sys.version_info.major == 3 and type(s) is bytes:
-                self.block_blob_service.create_blob_from_bytes(
-                    self.container_name, name, s)
-            else:
-                self.block_blob_service.create_blob_from_stream(
-                    self.container_name, name, s)
-            return self.block_blob_service.make_blob_url(
-                self.container_name, name)
-
-    def upload_folder(self, folder, target_prefix):
-
-        def remove_tailing(x):
-            if x.endswith('/') or x.endswith('\\'):
-                x = x[:-1]
-            return x
-
-        folder = remove_tailing(folder)
-        target_prefix = remove_tailing(target_prefix)
-        for root, dirs, files in os.walk(folder):
-            for f in files:
-                src_file = op.join(root, f)
-                assert src_file.startswith(folder)
-                target_file = src_file.replace(folder, target_prefix)
-                self.upload_file(src_file, target_file)
-            for d in dirs:
-                self.upload_folder(op.join(root, d), op.join(target_prefix, d))
-
-    def upload_file(self, src_file, target_file):
-        logging.info('uploading {} to {}'.format(src_file, target_file))
-        #import time
-        #start_time = time.time()
-        bar = [None]
-        last = [0]
-
-        def upload_callback(curr, total):
-            if total < 1024**3:
-                return
-            if bar[0] is None:
-                bar[0] = tqdm(total=total, unit_scale=True)
-            bar[0].update(curr - last[0])
-            last[0] = curr
-
-        if target_file.startswith('/'):
-            logging.info('remove starting slash for {}'.format(target_file))
-            target_file = target_file[1:]
-        self.block_blob_service.create_blob_from_path(
-            self.container_name,
-            target_file,
-            src_file,
-            max_connections=8,
-            progress_callback=upload_callback)
-
-    def az_upload(self, src_dir, dest_dir):
-        # this is using the old version of azcopy.
prefer to use az_upload2 - dest_url = op.join( - 'https://{}.blob.core.windows.net'.format(self.account_name), - self.container_name, dest_dir) - if self.account_key: - azcopy_upload(src_dir, dest_url, self.account_key) - else: - raise Exception - - @deprecated('use upload()') - def az_sync(self, src_dir, dest_dir): - self.upload(src_dir, dest_dir) - - def upload(self, src_dir, dest_dir, from_blob=None): - if from_blob is None: - self.upload_from_local(src_dir, dest_dir) - else: - self.upload_from_another(src_dir, dest_dir, from_blob) - - def upload_from_another(self, src_dir, dest_dir, from_blob): - assert self.sas_token - cmd = [] - cmd.append(get_azcopy()) - if from_blob.dir_exists(src_dir) and not self.dir_exists(dest_dir): - # in this case, azcopy will copy the local folder under the - # destination folder, and thus we have to use the folder of - # dest_dir as the dest_dir. - assert op.basename(src_dir) == op.basename(dest_dir) - dest_dir = op.dirname(dest_dir) - cmd.append('cp') - else: - if self.exists(dest_dir): - cmd.append('sync') - else: - cmd.append('cp') - url = 'https://{}.blob.core.windows.net'.format(from_blob.account_name) - url = op.join(url, from_blob.container_name, src_dir) - assert self.sas_token.startswith('?') - from_url = url - url = url + from_blob.sas_token - cmd.append(url) - - url = 'https://{}.blob.core.windows.net'.format(self.account_name) - if dest_dir.startswith('/'): - dest_dir = dest_dir[1:] - url = op.join(url, self.container_name, dest_dir) - assert self.sas_token.startswith('?') - data_url = url - url = url + self.sas_token - cmd.append(url) - - if from_blob.dir_exists(src_dir): - cmd.append('--recursive') - if from_url == data_url: - logging.info('no need to sync data as url is exactly the same') - return data_url, url - cmd_run(cmd) - return data_url, url - - def upload_from_local(self, src_dir, dest_dir): - assert self.sas_token - cmd = [] - cmd.append(get_azcopy()) - if op.isdir(src_dir) and not self.dir_exists(dest_dir): - # in this case, azcopy will copy the local folder under the - # destination folder, and thus we have to use the folder of - # dest_dir as the dest_dir. 
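            # Editor's note (added): e.g. syncing a local 'output/exp1' to a
            # not-yet-existing 'runs/exp1' would otherwise create
            # 'runs/exp1/exp1'; copying into the parent 'runs' avoids that,
            # and the assert checks that the two basenames agree so the
            # substitution is valid.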
- assert op.basename(src_dir) == op.basename(dest_dir) - dest_dir = op.dirname(dest_dir) - cmd.append('cp') - else: - if self.exists(dest_dir): - cmd.append('sync') - else: - cmd.append('cp') - cmd.append(op.realpath(src_dir)) - url = 'https://{}.blob.core.windows.net'.format(self.account_name) - if dest_dir.startswith('/'): - dest_dir = dest_dir[1:] - url = op.join(url, self.container_name, dest_dir) - assert self.sas_token.startswith('?') - data_url = url - url = url + self.sas_token - cmd.append(url) - if op.isdir(src_dir): - cmd.append('--recursive') - cmd_run(cmd) - return data_url, url - - @deprecated('use upload') - def az_upload2(self, src_dir, dest_dir, sync=False): - return self.upload(src_dir, dest_dir) - #assert self.sas_token - #cmd = [] - #cmd.append(get_azcopy()) - #if sync: - #cmd.append('sync') - #else: - #cmd.append('cp') - #cmd.append(op.realpath(src_dir)) - #url = 'https://{}.blob.core.windows.net'.format(self.account_name) - #if dest_dir.startswith('/'): - #dest_dir = dest_dir[1:] - #url = op.join(url, self.container_name, dest_dir) - #assert self.sas_token.startswith('?') - #data_url = url - #url = url + self.sas_token - #cmd.append(url) - #if op.isdir(src_dir): - #cmd.append('--recursive') - #cmd_run(cmd) - #return data_url, url - - def query_info(self, path): - p = self.block_blob_service.get_blob_properties( - self.container_name, path) - result = { - 'size_in_bytes': p.properties.content_length, - 'creation_time': p.properties.creation_time, - } - return result - - def az_download_all(self, remote_path, local_path): - all_blob_name = list(self.list_blob_names(remote_path)) - all_blob_name = get_leaf_names(all_blob_name) - for blob_name in all_blob_name: - target_file = blob_name.replace(remote_path, local_path) - if not op.isfile(target_file): - self.az_download(blob_name, target_file, sync=False) - - def is_folder(self, remote_path): - is_folder = False - for x in self.list_blob_names(remote_path + '/'): - is_folder = True - break - return is_folder - - def az_download_each(self, remote_path, local_path): - # if it is a folder, we will download each file individually - if remote_path.startswith('/'): - remote_path = remote_path[1:] - if remote_path.endswith('/'): - remote_path = remote_path[:-1] - is_folder = self.is_folder(remote_path) - if is_folder: - all_remote_file = self.list_blob_names(remote_path + '/') - all_local_file = [ - op.join(local_path, r[len(remote_path) + 1:]) - for r in all_remote_file - ] - else: - all_remote_file = [remote_path] - all_local_file = [local_path] - for r, l in zip(all_remote_file, all_local_file): - self.az_download(r, l, sync=True) - - def az_download( - self, - remote_path, - local_path, - sync=True, - is_folder=None, - tmp_first=True, - file_list=None, - retry=1, - ): - from .qd_common import limited_retry_agent - limited_retry_agent(retry, self.az_download_once, remote_path, - local_path, sync, is_folder, tmp_first, file_list) - - def az_download_once( - self, - remote_path, - local_path, - sync=True, - is_folder=None, - tmp_first=True, - file_list=None, - ): - if remote_path.startswith('/'): - remote_path = remote_path[1:] - if remote_path.endswith('/'): - remote_path = remote_path[:-1] - if is_folder is None: - is_folder = self.is_folder(remote_path) - if sync and tmp_first: - logging.info('if sync, no need to save to temp first') - tmp_first = False - if is_folder: - if not sync and op.isdir(local_path) and tmp_first: - if len(os.listdir(local_path)) > 0: - logging.error('ignore to download from {} to {}' - ' since 
destination is not empty'.format( - remote_path, - local_path, - )) - return - from .qd_common import ensure_remove_dir - ensure_remove_dir(local_path) - else: - if sync: - if tmp_first: - sync = False - elif not op.isfile(local_path): - sync = False - ensure_directory(op.dirname(local_path)) - origin_local_path = local_path - if tmp_first: - local_path = local_path + '.tmp' - ensure_directory(op.dirname(local_path)) - assert self.sas_token - cmd = [] - cmd.append(get_azcopy()) - if sync: - cmd.append('sync') - else: - cmd.append('cp') - url = 'https://{}.blob.core.windows.net'.format(self.account_name) - url = '/'.join([url, self.container_name, remote_path]) - assert self.sas_token.startswith('?') - data_url = url - url = url + self.sas_token - cmd.append(url) - if file_list: - # requirements from file_list - assert op.basename(local_path) == op.basename(remote_path) - cmd.append(op.dirname(op.realpath(local_path))) - else: - cmd.append(op.realpath(local_path)) - if is_folder: - cmd.append('--recursive') - if sync: - # azcopy's requirement - ensure_directory(local_path) - else: - ensure_directory(op.dirname(local_path)) - if file_list: - cmd.append('--list-of-files') - cmd.append(file_list) - cmd_run(cmd) - if tmp_first: - os.rename(local_path, origin_local_path) - return data_url, url - - def download_to_path(self, blob_name, local_path, max_connections=2): - dir_path = op.dirname(local_path) - if op.isfile(dir_path) and get_file_size(dir_path) == 0: - os.remove(dir_path) - ensure_directory(dir_path) - tmp_local_path = local_path + '.tmp' - pbar = {} - - def progress_callback(curr, total): - if len(pbar) == 0: - pbar['tqdm'] = tqdm(total=total, unit_scale=True) - pbar['last'] = 0 - pbar['count'] = 0 - pbar['count'] += 1 - if pbar['count'] > 100: - pbar['tqdm'].update(curr - pbar['last']) - pbar['last'] = curr - pbar['count'] = 0 - - self.block_blob_service.get_blob_to_path( - self.container_name, - blob_name, - tmp_local_path, - progress_callback=progress_callback, - max_connections=max_connections) - os.rename(tmp_local_path, local_path) - - def download_to_stream(self, blob_name, s, max_connections=2): - pbar = {} - - def progress_callback(curr, total): - if len(pbar) == 0: - pbar['tqdm'] = tqdm(total=total, unit_scale=True) - pbar['last'] = 0 - pbar['count'] = 0 - pbar['count'] += 1 - if pbar['count'] > 1: - pbar['tqdm'].update(curr - pbar['last']) - pbar['last'] = curr - pbar['count'] = 0 - - self.block_blob_service.get_blob_to_stream( - self.container_name, - blob_name, - s, - max_connections=max_connections, - progress_callback=progress_callback, - ) - - def exists(self, path): - return self.file_exists(path) or self.dir_exists(path) - - def file_exists(self, path): - fp = acquireLock('/tmp/{}.lock'.format(hash_sha1(path))) - result = self.block_blob_service.exists(self.container_name, path) - releaseLock(fp) - return result - - def dir_exists(self, dir_path): - dir_path = op.normpath(dir_path) - for x in self.list_blob_names(prefix=dir_path + '/'): - return True - return False - - def blob_download_qdoutput( - self, - src_path, - target_folder, - latest_only=True, - creation_time_larger_than=None, - too_large_limit_in_gb=None, - ignore_base_fname_patterns=None, - dry_run=False, - ): - - def is_in_snapshot(b): - parts = list(b.split('/')) - return 'snapshot' in parts and parts.index( - 'snapshot') < len(parts) - 1 - - all_blob_name = list( - self.list_blob_names(src_path, creation_time_larger_than)) - # remove snapshot/model_iter/abc.bin - clean = [] - for b in all_blob_name: - 
parts = list(b.split('/')) - if 'snapshot' in parts and \ - parts.index('snapshot') < len(parts) - 2: - continue - else: - clean.append(b) - all_blob_name = clean - - all_blob_name = get_leaf_names(all_blob_name) - in_snapshot_blobs = [b for b in all_blob_name if is_in_snapshot(b)] - not_in_snapshot_blobs = [ - b for b in all_blob_name if not is_in_snapshot(b) - ] - try: - not_in_snapshot_blobs.remove(src_path) - except: - pass - try: - not_in_snapshot_blobs.remove(src_path + '/snapshot') - except: - pass - need_download_blobs = [] - need_download_blobs.extend(not_in_snapshot_blobs) - iters = [parse_iteration(f) for f in in_snapshot_blobs] - if len(iters) > 0 and latest_only: - max_iters = max(iters) - need_download_blobs.extend([ - f for f, i in zip(in_snapshot_blobs, iters) - if i == max_iters or f.endswith('.report') - ]) - need_download_blobs.extend( - [f for f, i in zip(in_snapshot_blobs, iters) if i == -2]) - to_remove = [] - for i, b1 in enumerate(need_download_blobs): - for b2 in need_download_blobs: - if b1 != b2 and b2.startswith(b1) and b2.startswith(b1 + '/'): - to_remove.append(b1) - break - for t in to_remove: - need_download_blobs.remove(t) - need_download_blobs = [ - t for t in need_download_blobs if not t.endswith('.tmp') - ] - f_target_f = [(f, f.replace(src_path, target_folder)) - for f in need_download_blobs] - f_target_f = [(f, target_f) for f, target_f in f_target_f - if not op.isfile(target_f)] - f_target_f = [(f, target_f) for f, target_f in f_target_f - if len(f) > 0] - - if ignore_base_fname_patterns is not None: - import re - result = [] - for f, target_f in f_target_f: - if any( - re.match(p, f) is not None - for p in ignore_base_fname_patterns): - logging.info( - 'ignore {} due to reg pattern matching'.format(f)) - else: - result.append((f, target_f)) - f_target_f = result - - if too_large_limit_in_gb is not None: - logging.info('before size filtering = {}'.format(len(f_target_f))) - f_target_f = [(f, target_f) for f, target_f in f_target_f - if self.query_info(f)['size_in_bytes'] / - 1024.**3 <= too_large_limit_in_gb] - logging.info('after size filtering = {}'.format(len(f_target_f))) - for f, target_f in tqdm(f_target_f): - logging.info('download {} to {}'.format(f, target_f)) - try: - if not dry_run: - self.az_download(f, target_f) - except: - pass - #self.download_to_path(f, target_f) - - -if __name__ == '__main__': - from .qd_common import init_logging, parse_general_args - init_logging() - kwargs = parse_general_args() - logging.info('param:\n{}'.format(pformat(kwargs))) - function_name = kwargs['type'] - del kwargs['type'] - locals()[function_name](**kwargs) diff --git a/AVLFormer/src/utils/comm.py b/AVLFormer/src/utils/comm.py deleted file mode 100644 index 1d1d932..0000000 --- a/AVLFormer/src/utils/comm.py +++ /dev/null @@ -1,235 +0,0 @@ -""" -This file contains primitives for multi-gpu communication. -This is useful when doing distributed training. 
-""" - -import os -import pickle - -import torch -import torch.distributed as dist - -from .logger import LOGGER - - -def dist_init(args): - ngpus_per_node = torch.cuda.device_count() - - local_rank = int(os.environ['SLURM_LOCALID']) - gpu = local_rank - port = str(13824) - proc_id = int(os.environ['SLURM_PROCID']) - ntasks = int(os.environ['SLURM_NTASKS']) - node_list = os.environ['SLURM_NODELIST'] - if '[' in node_list: - beg = node_list.find('[') - pos1 = node_list.find('-', beg) - if pos1 < 0: - pos1 = 1000 - pos2 = node_list.find(',', beg) - if pos2 < 0: - pos2 = 1000 - node_list = node_list[:min(pos1, pos2)].replace('[', '') - addr = node_list[8:].replace('-', '.') - os.environ['MASTER_PORT'] = port - os.environ['MASTER_ADDR'] = addr - os.environ['WORLD_SIZE'] = str(ntasks) - os.environ['RANK'] = str(proc_id) - os.environ['LOCAL_RANK'] = str(local_rank) - rank = int(os.environ["RANK"]) - world_size = int(os.environ['WORLD_SIZE']) - dist_url = 'tcp://' + addr + ':' + port - - args.num_gpus = ntasks - args.distributed = ntasks > 1 - args.local_rank = str(local_rank) - - torch.cuda.set_device(gpu) - - print('distributed init (rank {}): {}, gpu {} | ngpu_node {}'.format( - rank, dist_url, gpu, ngpus_per_node), - flush=True) - dist.init_process_group(backend='nccl', - init_method=dist_url, - world_size=world_size, - rank=rank) - torch.distributed.barrier() - # setup_for_distributed(rank == 0) - - -def get_world_size(): - if 'WORLD_SIZE' in os.environ: - return int(os.environ['WORLD_SIZE']) - return int(os.environ.get('OMPI_COMM_WORLD_SIZE', '1')) - # if not dist.is_available(): - # return 1 - # if not dist.is_initialized(): - # return 1 - # return dist.get_world_size() - - -def get_rank(): - if 'RANK' in os.environ: - return int(os.environ['RANK']) - return int(os.environ.get('OMPI_COMM_WORLD_RANK', '0')) - # if not dist.is_available(): - # return 0 - # if not dist.is_initialized(): - # return 0 - # return dist.get_rank() - - -def get_local_rank(): - if 'LOCAL_RANK' in os.environ: - return int(os.environ['LOCAL_RANK']) - return int(os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK', '0')) - - -def get_local_size(): - if 'LOCAL_SIZE' in os.environ: - return int(os.environ['LOCAL_SIZE']) - return int(os.environ.get('OMPI_COMM_WORLD_LOCAL_SIZE', '1')) - - -def is_main_process(): - return get_rank() == 0 - - -def synchronize(): - """ - Helper function to synchronize (barrier) among all processes when - using distributed training - """ - if not dist.is_available(): - return - if not dist.is_initialized(): - return - world_size = dist.get_world_size() - if world_size == 1: - return - dist.barrier() - - -def gather_on_master(data): - """Same as all_gather, but gathers data on master process only, using CPU. - Thus, this does not work with NCCL backend unless they add CPU support. - - The memory consumption of this function is ~ 3x of data size. While in - principal, it should be ~2x, it's not easy to force Python to release - memory immediately and thus, peak memory usage could be up to 3x. 
- """ - world_size = get_world_size() - if world_size == 1: - return [data] - - # serialized to a Tensor - buffer = pickle.dumps(data) - # trying to optimize memory, but in fact, it's not guaranteed to be released - del data - storage = torch.ByteStorage.from_buffer(buffer) - del buffer - tensor = torch.ByteTensor(storage) - - # obtain Tensor size of each rank - local_size = torch.LongTensor([tensor.numel()]) - size_list = [torch.LongTensor([0]) for _ in range(world_size)] - dist.all_gather(size_list, local_size) - size_list = [int(size.item()) for size in size_list] - max_size = max(size_list) - - if local_size != max_size: - padding = torch.ByteTensor(size=(max_size - local_size, )) - tensor = torch.cat((tensor, padding), dim=0) - del padding - - if is_main_process(): - tensor_list = [] - for _ in size_list: - tensor_list.append(torch.ByteTensor(size=(max_size, ))) - dist.gather(tensor, gather_list=tensor_list, dst=0) - del tensor - else: - dist.gather(tensor, gather_list=[], dst=0) - del tensor - return - - data_list = [] - for tensor in tensor_list: - buffer = tensor.cpu().numpy().tobytes() - del tensor - data_list.append(pickle.loads(buffer)) - del buffer - - return data_list - - -def all_gather(data): - """ - Run all_gather on arbitrary picklable data (not necessarily tensors) - Args: - data: any picklable object - Returns: - list[data]: list of data gathered from each rank - """ - world_size = get_world_size() - if world_size == 1: - return [data] - - # serialized to a Tensor - buffer = pickle.dumps(data) - storage = torch.ByteStorage.from_buffer(buffer) - tensor = torch.ByteTensor(storage).to("cuda") - - # obtain Tensor size of each rank - local_size = torch.LongTensor([tensor.numel()]).to("cuda") - size_list = [torch.LongTensor([0]).to("cuda") for _ in range(world_size)] - dist.all_gather(size_list, local_size) - size_list = [int(size.item()) for size in size_list] - max_size = max(size_list) - - # receiving Tensor from all ranks - # we pad the tensor because torch all_gather does not support - # gathering tensors of different shapes - tensor_list = [] - for _ in size_list: - tensor_list.append(torch.ByteTensor(size=(max_size, )).to("cuda")) - if local_size != max_size: - padding = torch.ByteTensor(size=(max_size - local_size, )).to("cuda") - tensor = torch.cat((tensor, padding), dim=0) - dist.all_gather(tensor_list, tensor) - - data_list = [] - for size, tensor in zip(size_list, tensor_list): - buffer = tensor.cpu().numpy().tobytes()[:size] - data_list.append(pickle.loads(buffer)) - - return data_list - - -def reduce_dict(input_dict, average=True): - """ - Args: - input_dict (dict): all the values will be reduced - average (bool): whether to do average or sum - Reduce the values in the dictionary from all processes so that process with rank - 0 has the averaged results. Returns a dict with the same fields as - input_dict, after reduction. 
- """ - world_size = get_world_size() - if world_size < 2: - return input_dict - with torch.no_grad(): - names = [] - values = [] - # sort the keys so that they are consistent across processes - for k in sorted(input_dict.keys()): - names.append(k) - values.append(input_dict[k]) - values = torch.stack(values, dim=0) - dist.reduce(values, dst=0) - if dist.get_rank() == 0 and average: - # only main process gets accumulated, so only divide by - # world_size in this case - values /= world_size - reduced_dict = {k: v for k, v in zip(names, values)} - return reduced_dict diff --git a/AVLFormer/src/utils/deepspeed.py b/AVLFormer/src/utils/deepspeed.py deleted file mode 100644 index 803cc73..0000000 --- a/AVLFormer/src/utils/deepspeed.py +++ /dev/null @@ -1,60 +0,0 @@ -from pprint import pformat - -import torch - -from .logger import LOGGER as logger - - -def get_deepspeed_config(args): - config_params = { - 'train_batch_size': args.effective_batch_size, - } - - use_fp16 = args.deepspeed_fp16 - use_amp = not args.deepspeed_fp16 # by default, if not use deepspeed fp16, will enable deepspeed amp - - if use_amp: - config_params['amp'] = { - 'enabled': True, - 'opt_level': f'O{args.amp_opt_level}', - } - - if use_fp16: - config_params['fp16'] = { - 'enabled': True, - } - - gradient_clip = args.max_grad_norm - if gradient_clip: - config_params['gradient_clipping'] = gradient_clip - - config_params['flops_profiler'] = { - 'enabled': False, - 'profile_step': 1, - 'module_depth': -1, - 'top_modules': 3, - 'detailed': True, - } - - config_params['logging'] = { - 'steps_per_print': args.logging_steps * 10, - } - if hasattr(args, "zero_opt_stage") and args.zero_opt_stage > 0: - config_params['zero_optimization'] = { - 'stage': args.zero_opt_stage, - } - if args.zero_opt_stage > 0: - config_params['fp16'] = {'enabled': True} - config_params['zero_allow_untested_optimizer'] = True - - logger.info(pformat(config_params)) - return config_params - - -def fp32_to_fp16(inputs): - # deepspeed does not auto cast inputs. 
- for k, v in inputs.items(): - if isinstance(v, torch.Tensor) and v.dtype == torch.float32: - v = v.to(dtype=torch.half) - inputs[k] = v - return inputs diff --git a/AVLFormer/src/utils/latex_writer.py b/AVLFormer/src/utils/latex_writer.py deleted file mode 100644 index f8a3398..0000000 --- a/AVLFormer/src/utils/latex_writer.py +++ /dev/null @@ -1,273 +0,0 @@ -def print_csv_table(r, rows, cols): - ''' - r: r[row][col] - ''' - all_line = [] - all_line.append(',' + ','.join(map(str, cols))) - - def quote_if_comma(p): - if ',' in p: - return '"' + p + '"' - else: - return p - - for row in rows: - parts = [] - parts.append(row) - for col in cols: - parts.append(r[row][col]) - all_line.append(','.join(map(quote_if_comma, map(str, parts)))) - return '\n'.join(all_line) - - -def cartesian_index(sizes): - index = [0] * len(sizes) - yield index - - def is_end(index, sizes): - for i in range(len(index)): - if index[i] != sizes[i] - 1: - return False - return True - - while not is_end(index, sizes): - found = -1 - for i in range(len(index) - 1, -1, -1): - if index[i] == sizes[i] - 1: - index[i] = 0 - else: - found = i - break - if found != -1: - index[found] = index[found] + 1 - yield index - - -def _spans(sizes): - span = [] - for i, s in enumerate(sizes): - x = 1 - for j in range(i + 1, len(sizes)): - x = x * sizes[j] - span.append(x) - return span - - -def _dup(sizes): - dup = [] - k = 1 - for i in range(len(sizes)): - dup.append(k) - k = k * sizes[i] - return dup - - -def _extract(r, names, index): - x = r - for i in range(len(index)): - k = names[i][index[i]] - if k not in x: - return '' - x = x[k] - return x - - -def print_latex_table(r, all_rows, all_cols, **kwargs): - return print_m_table(r, all_rows, all_cols, **kwargs) - - -def print_simple_latex_table(all_a2b, - keys, - caption=None, - label=None, - span_two=False, - interval=None): - lines = [] - if span_two: - lines.append('\\begin{table*}') - else: - lines.append('\\begin{table}') - lines.append('\\centering') - if caption: - lines.append('\\caption{{{}}}'.format(caption)) - if label: - lines.append('\\label{{{}}}'.format(label)) - num_cols = 0 - for k in keys: - if isinstance(k, str): - num_cols += 1 - else: - assert isinstance(k, dict) and len(k) == 1 - num_cols += len(k[list(k.keys())[0]]) - if interval: - assert len(interval) == num_cols - 1 - column_config = ''.join([ - 'c@{{{}}}'.format('~' * i) if i is not None else 'c' - for i in interval - ]) - column_config += 'c' - else: - column_config = 'c' * num_cols - line = '\\begin{{tabular}}{{{}}}'.format(column_config) - lines.append(line) - lines.append('\\toprule') - - if any(isinstance(k, dict) for k in keys): - - def get_first(k): - if isinstance(k, str): - return '' - #return '\multirow{{2}}{{*}}{{{}}}'.format(k) - else: - sub_num_cols = len(k[list(k.keys())[0]]) - return '\multicolumn{{{}}}{{c}}{{{}}}'.format( - sub_num_cols, - list(k.keys())[0]) - - def get_seconds(k): - if isinstance(k, str): - return [k] - else: - return k[list(k.keys())[0]] - - line = ' & '.join([get_first(k) for k in keys]) - line = line + '\\\\' - lines.append(line) - start = 1 - for k in keys: - if isinstance(k, dict): - sub_len = len(k[list(k.keys())[0]]) - line = '\\cmidrule(lr){{{}-{}}}'.format( - start, start + sub_len - 1) - lines.append(line) - start += sub_len - else: - start += 1 - - line = ' & '.join(sk for k in keys for sk in get_seconds(k)) - line = line + '\\\\' - lines.append(line) - keys2 = [] - for k in keys: - if isinstance(k, str): - keys2.append(k) - else: - for s_k, s_v in k.items(): - 
for a in s_v: - keys2.append(s_k + '$' + a) - keys = keys2 - else: - line = ' & '.join(keys) - line = line + '\\\\' - lines.append(line) - lines.append('\\midrule') - - for a2b in all_a2b: - line = ' & '.join(map(str, [a2b.get(k, '') for k in keys])) - line = line + '\\\\' - lines.append(line) - if a2b.get('__add_line_after'): - lines.append('\\midrule') - - lines.append('\\bottomrule') - lines.append('\\end{tabular}') - if span_two: - lines.append('\\end{table*}') - else: - lines.append('\\end{table}') - - return '\n'.join(lines) - - -def print_m_table(r, all_rows, all_cols, caption=None, compact=False): - sizes_cols = list(map(len, all_cols)) - cols_dup = _dup(sizes_cols) - cols_span = _spans(sizes_cols) - num_cols = cols_span[0] * sizes_cols[0] + len(all_rows) - - lines = [] - lines.append('\\begin{table*}') - lines.append('\\centering') - if caption: - lines.append('\\caption{{{}}}'.format(caption)) - lines.append('\\label{{{}}}'.format(caption.replace(' ', '_'))) - if compact: - c = '@{~}c' - else: - c = 'c' - line = '\\begin{{tabular}}{{{}@{{}}}}'.format(c * num_cols) - lines.append(line) - lines.append('\\toprule') - - for i in range(len(all_cols)): - line = '' - for j in range(len(all_rows) - 1): - line = line + '&' - s = cols_span[i] - for j in range(cols_dup[i]): - for k in range(len(all_cols[i])): - if s == 1: - line = line + '&{}'.format(all_cols[i][k]) - else: - line = line + '&\multicolumn{{{0}}}{{c}}{{{1}}}'.format( - s, all_cols[i][k]) - line = line + '\\\\' - lines.append(line) - lines.append('\\midrule') - sizes_rows = list(map(len, all_rows)) - rows_span = _spans(sizes_rows) - digit_format = '&{}' - for index in cartesian_index(sizes_rows): - line = '' - for i in range(len(index)): - prefix = '' if i == 0 else '&' - if all(v == 0 for v in index[i + 1:]): - if rows_span[i] == 1: - line = '{}{}{}'.format(line, prefix, all_rows[i][index[i]]) - else: - line = line + prefix + \ - '\multirow{{{0}}}{{*}}{{{1}}}'.format(rows_span[i], - all_rows[i][index[i]]) - else: - if rows_span[i] == 1: - line = '{}{}{}'.format(line, prefix, all_rows[i][index[i]]) - for col_index in cartesian_index(sizes_cols): - value = _extract(_extract(r, all_rows, index), all_cols, col_index) - line = line + digit_format.format(value) - line = line + '\\\\' - lines.append(line) - is_end_first_index = True - for i in range(1, len(index)): - if index[i] != sizes_rows[i] - 1: - is_end_first_index = False - break - if is_end_first_index: - if index[0] != sizes_rows[0] - 1: - lines.append('\\midrule') - - lines.append('\\bottomrule') - lines.append('\\end{tabular}') - lines.append('\\end{table*}') - - return '\n'.join(lines) - - -def print_table(r, rows, cols): - return print_m_table(r, [rows], [cols]) - - -def test_print_m_table(): - r = {} - r['dog'] = {} - r['dog']['dog1'] = {} - r['dog']['dog1']['s'] = {} - r['dog']['dog1']['s']['s1'] = 0 - r['dog']['dog1']['s']['s2'] = 1 - r['dog']['dog2'] = {} - r['dog']['dog2']['s'] = {} - r['dog']['dog2']['s']['s1'] = 2 - r['dog']['dog2']['s']['s2'] = 3 - - import logging - logging.info( - print_m_table(r, [['dog'], ['dog1', 'dog2']], [['s'], ['s1', 's2']])) diff --git a/AVLFormer/src/utils/load_files.py b/AVLFormer/src/utils/load_files.py deleted file mode 100755 index 1399bb9..0000000 --- a/AVLFormer/src/utils/load_files.py +++ /dev/null @@ -1,76 +0,0 @@ -from collections import OrderedDict -import errno -import json -import os -import os.path as op - -import yaml - - -def load_labelmap_file(labelmap_file): - label_dict = None - - if labelmap_file.endswith('json'): - 
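-        # a json labelmap is expected to hold {'label_to_idx': {...}} with
-        # 1-based indices; the values are shifted to 0-based below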
label_dict = json.load(open(labelmap_file, 'r'))['label_to_idx'] - label_dict = {key: val - 1 for key, val in label_dict.items()} - return label_dict - - if labelmap_file is not None and op.isfile(labelmap_file): - label_dict = OrderedDict() - with open(labelmap_file, 'r') as fp: - for line in fp: - label = line.strip().split('\t')[0] - if label in label_dict: - raise ValueError("Duplicate label " + label + - " in labelmap.") - else: - label_dict[label] = len(label_dict) - return label_dict - - -def config_dataset_file(data_dir, dataset_file): - if dataset_file: - if op.isfile(dataset_file): - dataset_file = dataset_file - elif op.isfile(op.join(data_dir, dataset_file)): - dataset_file = op.join(data_dir, dataset_file) - else: - raise ValueError("cannot find file: {}".format(dataset_file)) - return dataset_file - - -def load_linelist_file(linelist_file): - if linelist_file is not None: - line_list = [] - with open(linelist_file, 'r') as fp: - for i in fp: - line_list.append(int(i.strip())) - return line_list - - -def load_box_linelist_file(linelist_file): - if linelist_file is not None: - img_line_list = [] - box_line_list = [] - with open(linelist_file, 'r') as fp: - for i in fp: - idx = [int(_) for _ in i.strip().split('\t')] - img_line_list.append(idx[0]) - box_line_list.append(idx[1]) - return [img_line_list, box_line_list] - - -def load_from_yaml_file(yaml_file): - with open(yaml_file, 'r') as fp: - return yaml.load(fp, Loader=yaml.CLoader) - - -def find_file_path_in_yaml(fname, root): - if fname is not None: - if op.isfile(fname): - return fname - elif op.isfile(op.join(root, fname)): - return op.join(root, fname) - else: - raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), - op.join(root, fname)) diff --git a/AVLFormer/src/utils/load_save.py b/AVLFormer/src/utils/load_save.py deleted file mode 100644 index 084ec58..0000000 --- a/AVLFormer/src/utils/load_save.py +++ /dev/null @@ -1,362 +0,0 @@ -""" -saving utilities -""" -import json -import os -from os.path import dirname, exists, join, realpath - -from apex import amp -from easydict import EasyDict as edict -from src.utils.basic_utils import make_zipfile, save_json -from src.utils.logger import LOGGER -import torch - -from .basic_utils import is_jsonable - - -def save_training_meta(args): - # args is an EasyDict object, treat it the same as a normal dict - os.makedirs(join(args.output_dir, 'log'), exist_ok=True) - os.makedirs(join(args.output_dir, 'ckpt'), exist_ok=True) - - # training args - save_args_path = join(args.output_dir, 'log', 'args.json') - save_json(args, save_args_path, save_pretty=True) - - # model args - model_config = json.load(open(args.model_config)) - save_model_config_path = join(args.output_dir, 'log', 'model_config.json') - save_json(model_config, save_model_config_path, save_pretty=True) - - # save a copy of the codebase. !!!Do not store heavy file in your codebase when using it. 
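-    # the call below zips everything from three directory levels above this
-    # file into <output_dir>/code.zip, excluding results and caches, so the
-    # exact code used for a run can be recovered later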
- code_dir = dirname(dirname(dirname(realpath(__file__)))) - code_zip_filename = os.path.join(args.output_dir, "code.zip") - LOGGER.info(f"Saving code from {code_dir} to {code_zip_filename}...") - make_zipfile( - code_dir, - code_zip_filename, - enclosing_dir="code", - exclude_dirs_substring="results", - exclude_dirs=["results", "debug_results", "__pycache__", "linjli"], - exclude_extensions=[".pyc", ".ipynb", ".swap"]) - LOGGER.info(f"Saving code done.") - - -class TrainingSaver(object): - - def __init__(self, output_dir): - self.output_dir = output_dir - self.max_save_load_trial = 10 - - def save_tokenizer(self, tokenizer): - tokenizer_dir = join(self.output_dir, 'tokenizer') - os.makedirs(tokenizer_dir, exist_ok=True) - if tokenizer is not None: - tokenizer.save_pretrained(tokenizer_dir) - - def save_args(self, args): - arg_dir = join(self.output_dir, 'log') - os.makedirs(arg_dir, exist_ok=True) - save_args_path = join(arg_dir, 'args.json') - LOGGER.info(f"Training/evaluation parameters: {args}") - LOGGER.info(f"saving args to {save_args_path}") - temp_args = edict(vars(args)) - for key, value in temp_args.items(): - if not is_jsonable(value): - value = f'{value}' - temp_args[key] = value - save_json(temp_args, save_args_path, save_pretty=True, sort_keys=True) - - def save_model(self, checkpoint_dir, step, model, optimizer=None): - os.makedirs(checkpoint_dir, exist_ok=True) - model_path = join(checkpoint_dir, 'model.bin') - model_to_save = model.module if hasattr(model, 'module') else model - state_dict = { - k: v.cpu() if isinstance(v, torch.Tensor) else v - for k, v in model_to_save.state_dict().items() - } - # with retrial, as azure blob fails occasionally. - save_trial = 0 - while save_trial < self.max_save_load_trial: - exception_msg = '' - try: - LOGGER.info(f"ModelSaver save trial NO. 
{save_trial}") - torch.save(state_dict, model_path) - if optimizer is not None: - optimizer_state_dict = { - k: v.cpu() if isinstance(v, torch.Tensor) else v - for k, v in optimizer.state_dict().items() - } - dump = {'step': step, 'optimizer': optimizer_state_dict} - torch.save(dump, f'{checkpoint_dir}/optmizer_state.bin') - LOGGER.info(f"Save checkpoint to {checkpoint_dir}") - break - except Exception as e: - exception_msg = e - save_trial += 1 - else: - LOGGER.info( - f"Failed to save checkpoint after {self.max_save_load_trial} trails, " - f"exception msg: {exception_msg}.") - return - - -def load_state_dict_with_mismatch(model, loaded_state_dict_or_path): - """operated in-place, no need to return `model`""" - - if isinstance(loaded_state_dict_or_path, str): - loaded_state_dict = torch.load(loaded_state_dict_or_path, - map_location="cpu") - else: - loaded_state_dict = loaded_state_dict_or_path - model_keys = set([k for k in list(model.state_dict().keys())]) - load_keys = set(loaded_state_dict.keys()) - - toload = {} - mismatched_shape_keys = [] - for k in model_keys: - if k in load_keys: - if model.state_dict()[k].shape != loaded_state_dict[k].shape: - mismatched_shape_keys.append(k) - else: - toload[k] = loaded_state_dict[k] - - LOGGER.info( - "You can ignore the keys with `num_batches_tracked` or from task heads" - ) - LOGGER.info("Keys in loaded but not in model:") - diff_keys = load_keys.difference(model_keys) - LOGGER.info(f"In total {len(diff_keys)}, {sorted(diff_keys)}") - LOGGER.info("Keys in model but not in loaded:") - diff_keys = model_keys.difference(load_keys) - LOGGER.info(f"In total {len(diff_keys)}, {sorted(diff_keys)}") - LOGGER.info("Keys in model and loaded, but shape mismatched:") - LOGGER.info( - f"In total {len(mismatched_shape_keys)}, {sorted(mismatched_shape_keys)}" - ) - model.load_state_dict(toload, strict=False) - - -def compare_dict_difference(dict1, - dict2, - dict1_name="dict1", - dict2_name="dict2", - print_value_diff=True, - verbose=False, - exclude_keys=()): - """ - Args: - dict1: - dict2: - dict1_name: - dict2_name: - print_value_diff: bool, output dict value difference within shared keys - for dict1 and dict2. 
-            In effect only when verbose == True.
-        verbose: bool, log the key/value differences.
-    """
-    exclude_keys = set(exclude_keys)
-    keys1 = set(dict1.keys()).difference(exclude_keys)
-    keys2 = set(dict2.keys()).difference(exclude_keys)
-    shared_keys = keys1.intersection(keys2)
-    keys1_unique = keys1.difference(shared_keys)
-    keys2_unique = keys2.difference(shared_keys)
-    key_diff_list = list(keys1_unique) + list(keys2_unique)
-
-    # value difference in the shared keys of dict1 and dict2
-    value_diff_dict = {}
-    for k in shared_keys:
-        if dict1[k] != dict2[k]:
-            value_diff_dict[k] = [(dict1_name, dict1[k]),
-                                  (dict2_name, dict2[k])]
-
-    if len(value_diff_dict) == 0 and len(key_diff_list) == 0:
-        return True
-
-    # named differently from the print_value_diff argument so the flag is
-    # not shadowed inside this helper
-    def log_value_diff():
-        if verbose and print_value_diff:
-            LOGGER.info("=" * 30 + "value difference")
-            LOGGER.info(f"{json.dumps(value_diff_dict, indent=4)}")
-
-    if len(value_diff_dict) > 0 and len(key_diff_list) == 0:
-        # OK
-        log_value_diff()
-        return True
-
-    if verbose:
-        LOGGER.info("=" * 30 + "key difference")
-        LOGGER.info(f"keys in {dict1_name} but not in {dict2_name}: "
-                    f"total {len(keys1_unique)}, {sorted(keys1_unique)}")
-        LOGGER.info(f"keys in {dict2_name} but not in {dict1_name}: "
-                    f"total {len(keys2_unique)}, {sorted(keys2_unique)}")
-    return False
-
-
-def _to_cuda(state):
-    """ usually loaded from a cpu checkpoint but needs to live on cuda """
-    if isinstance(state, torch.Tensor):
-        ret = state.cuda()  # assumes torch.cuda.set_device was properly called
-        if 'Half' in state.type():
-            ret = ret.float()  # apex O2 requires it
-        return ret
-    elif isinstance(state, list):
-        new_state = [_to_cuda(t) for t in state]
-    elif isinstance(state, tuple):
-        new_state = tuple(_to_cuda(t) for t in state)
-    elif isinstance(state, dict):
-        new_state = {n: _to_cuda(t) for n, t in state.items()}
-    else:
-        return state
-    return new_state
-
-
-def _to_cpu(state):
-    """ store on cpu to avoid pinning GPU0; fp16 to save space """
-    if isinstance(state, torch.Tensor):
-        ret = state.cpu()
-        if 'Float' in state.type():
-            ret = ret.half()
-        return ret
-    elif isinstance(state, list):
-        new_state = [_to_cpu(t) for t in state]
-    elif isinstance(state, tuple):
-        new_state = tuple(_to_cpu(t) for t in state)
-    elif isinstance(state, dict):
-        new_state = {n: _to_cpu(t) for n, t in state.items()}
-    else:
-        return state
-    return new_state
-
-
-class TrainingRestorer(object):
-
-    def __init__(self, args, model, optimizer):
-        if exists(f"{args.output_dir}/log/args.json"):
-            restore_args = json.load(
-                open(f'{args.output_dir}/log/args.json', 'r'))
-            restore_args_path = join(args.output_dir, 'log',
-                                     'restore_args.json')
-            # restore_args is already a plain dict loaded from json
-            temp_args = edict(restore_args)
-            for key, value in temp_args.items():
-                if not is_jsonable(value):
-                    value = f'{value}'
-                    temp_args[key] = value
-            save_json(temp_args,
-                      restore_args_path,
-                      save_pretty=True,
-                      sort_keys=True)
-            assert compare_dict_difference(args,
-                                           restore_args,
-                                           dict1_name="current_args",
-                                           dict2_name="restore_args",
-                                           print_value_diff=True,
-                                           verbose=True,
-                                           exclude_keys=('local_rank',))
-        # keep 2 checkpoints in case one gets corrupted
-        self.save_path = f'{args.output_dir}/restore.pt'
-        self.backup_path = f'{args.output_dir}/restore_backup.pt'
-        self.model = model
-        self.optimizer = optimizer
-        self.min_restore_steps = 20
-        self.restorer_save_step = max(
-            self.min_restore_steps,
-            int(args.restore_ratio * args.max_global_step))
-        # since saving to or loading from azure blob fails sometimes
-        self.max_save_load_trial = 10
-        self.amp = args.mixed_precision_method == "apex"
-        self.deepspeed = (args.mixed_precision_method == "deepspeed"
-                          and args.restore_deepspeed_ckpt)
-        if self.deepspeed:
-            self.save_path = f'{args.output_dir}/deepspeed_restore'
-            os.makedirs(self.save_path, exist_ok=True)
-            self.backup_path = f'{args.output_dir}/deepspeed_restore_backup'
-            os.makedirs(self.backup_path, exist_ok=True)
-        self.restore_at_init()
-
-    def restore_at_init(self):
-        if self.save_path.endswith(".pt"):
-            save_path = self.save_path
-            backup_path = self.backup_path
-        else:
-            # deepspeed
-            save_path = join(self.save_path, "restore_ckpt.pt")
-            backup_path = join(self.backup_path, "restore_ckpt.pt")
-        if exists(save_path) or exists(backup_path):
-            LOGGER.info('found previous checkpoint. try to resume...')
-            exception_msg = ''
-            # with retrial, as azure blob fails occasionally.
-            restore_trial = 0
-            while restore_trial < self.max_save_load_trial:
-                LOGGER.info(
-                    f"TrainingRestorer restore trial NO. {restore_trial}")
-                # try:
-                self.restore()
-                LOGGER.info(
-                    f"TrainingRestorer restore from global_step {self.global_step}"
-                )
-                break
-                # except Exception as e:
-                #     exception_msg = e
-                #     restore_trial += 1
-            # else:
-            #     LOGGER.info(
-            #         f"TrainingRestorer restore failed after {self.max_save_load_trial} trials, "
-            #         f"exception msg: {exception_msg}.")
-        else:
-            self.global_step = 0
-
-    def step(self):
-        self.global_step += 1
-        if self.global_step % self.restorer_save_step == 0:
-            # with retrial, as azure blob fails occasionally.
-            save_trial = 0
-            while save_trial < self.max_save_load_trial:
-                LOGGER.info(f"TrainingRestorer save trial NO. {save_trial}")
-                try:
-                    self.save()
-                    break
-                except Exception:
-                    save_trial += 1
-
-    def save(self):
-        checkpoint = {'global_step': self.global_step}
-        if not self.deepspeed:
-            # model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
-            checkpoint['model_state_dict'] = _to_cpu(self.model.state_dict())
-            checkpoint['optim_state_dict'] = _to_cpu(
-                self.optimizer.state_dict())
-            if self.amp:
-                checkpoint['amp_state_dict'] = amp.state_dict()
-            if exists(self.save_path):
-                os.rename(self.save_path, self.backup_path)
-            torch.save(checkpoint, self.save_path)
-        else:
-            # deepspeed, not efficient; rotate the old checkpoint dir first,
-            # then always write a fresh one
-            if exists(self.save_path):
-                os.rename(self.save_path, self.backup_path)
-            self.model.save_checkpoint(self.save_path)
-            torch.save(checkpoint, join(self.save_path, "restore_ckpt.pt"))
-
-    def restore(self):
-        if not self.deepspeed:
-            try:
-                checkpoint = torch.load(self.save_path)
-            except Exception:
-                checkpoint = torch.load(self.backup_path)
-            self.model.load_state_dict(_to_cuda(
-                checkpoint['model_state_dict']))
-            self.optimizer.load_state_dict(
-                _to_cuda(checkpoint['optim_state_dict']))
-            if self.amp:
-                amp.load_state_dict(checkpoint['amp_state_dict'])
-        else:
-            # deepspeed, not efficient
-            try:
-                checkpoint = torch.load(join(self.save_path,
-                                             "restore_ckpt.pt"))
-                self.model.load_checkpoint(self.save_path)
-            except Exception:
-                checkpoint = torch.load(
-                    join(self.backup_path, "restore_ckpt.pt"))
-                self.model.load_checkpoint(self.backup_path)
-        self.global_step = checkpoint['global_step']
-        LOGGER.info(f'resume training from step {self.global_step}')
diff --git a/AVLFormer/src/utils/logger.py b/AVLFormer/src/utils/logger.py
deleted file mode 100644
index 421edfb..0000000
--- a/AVLFormer/src/utils/logger.py
+++ /dev/null
@@ -1,191 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
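-# Logging helpers: a FileHandler variant that reopens and closes the log
-# file around every emit (so partial logs are readable on fused Azure
-# blobs), the global LOGGER, a TensorboardX wrapper, and a smoothed
-# RunningMeter.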
-import logging
-from logging import Handler, StreamHandler, getLevelName
-import os
-
-
-# this class is a copy of logging.FileHandler except we call self.close()
-# at the end of each emit. While closing and reopening the file after each
-# write is not efficient, it allows us to see partial logs when writing to
-# fused Azure blobs, which is very convenient
-class FileHandler(StreamHandler):
-    """
-    A handler class which writes formatted logging records to disk files.
-    """
-
-    def __init__(self, filename, mode='a', encoding=None, delay=False):
-        """
-        Open the specified file and use it as the stream for logging.
-        """
-        # Issue #27493: add support for Path objects to be passed in
-        filename = os.fspath(filename)
-        # keep the absolute path, otherwise derived classes which use this
-        # may come a cropper when the current directory changes
-        self.baseFilename = os.path.abspath(filename)
-        self.mode = mode
-        self.encoding = encoding
-        self.delay = delay
-        if delay:
-            # We don't open the stream, but we still need to call the
-            # Handler constructor to set level, formatter, lock etc.
-            Handler.__init__(self)
-            self.stream = None
-        else:
-            StreamHandler.__init__(self, self._open())
-
-    def close(self):
-        """
-        Closes the stream.
-        """
-        self.acquire()
-        try:
-            try:
-                if self.stream:
-                    try:
-                        self.flush()
-                    finally:
-                        stream = self.stream
-                        self.stream = None
-                        if hasattr(stream, "close"):
-                            stream.close()
-            finally:
-                # Issue #19523: call unconditionally to
-                # prevent a handler leak when delay is set
-                StreamHandler.close(self)
-        finally:
-            self.release()
-
-    def _open(self):
-        """
-        Open the current base file with the (original) mode and encoding.
-        Return the resulting stream.
-        """
-        return open(self.baseFilename, self.mode, encoding=self.encoding)
-
-    def emit(self, record):
-        """
-        Emit a record.
-
-        If the stream was not opened because 'delay' was specified in the
-        constructor, open it before calling the superclass's emit.
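-
-        The stream is closed again after the record is written (see
-        close() above); inefficient, but partial logs become visible on
-        fused Azure blobs right away.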
- """ - if self.stream is None: - self.stream = self._open() - StreamHandler.emit(self, record) - self.close() - - def __repr__(self): - level = getLevelName(self.level) - return '<%s %s (%s)>' % (self.__class__.__name__, self.baseFilename, - level) - - -_LOG_FMT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s' -_DATE_FMT = '%m/%d/%Y %H:%M:%S' -logging.basicConfig(format=_LOG_FMT, datefmt=_DATE_FMT, level=logging.INFO) -LOGGER = logging.getLogger('__main__') # this is the global logger -logging.getLogger('matplotlib.font_manager').disabled = True # cclin - -# def setup_logger(name, save_dir, distributed_rank, filename="log.txt"): -# logger = logging.getLogger(name) -# logger.setLevel(logging.INFO) -# # don't log results for the non-master process -# if distributed_rank > 0: -# return logger -# ch = logging.StreamHandler(stream=sys.stdout) -# ch.setLevel(logging.INFO) -# formatter = logging.Formatter(_LOG_FMT, datefmt=_DATE_FMT) -# ch.setFormatter(formatter) -# logger.addHandler(ch) - -# if save_dir: -# fh = FileHandler(os.path.join(save_dir, filename)) -# fh.setLevel(logging.INFO) -# fh.setFormatter(formatter) -# logger.addHandler(fh) - -# logging.getLogger('matplotlib.font_manager').disabled = True # cclin -# return logger - - -def add_log_to_file(log_path): - fh = FileHandler(log_path) - formatter = logging.Formatter(_LOG_FMT, datefmt=_DATE_FMT) - fh.setFormatter(formatter) - LOGGER.addHandler(fh) - - -from tensorboardX import SummaryWriter - - -class TensorboardLogger(object): - - def __init__(self): - self._logger = None - self._global_step = 0 - - def create(self, path): - self._logger = SummaryWriter(path) - - def noop(self, *args, **kwargs): - return - - def step(self): - self._global_step += 1 - - @property - def global_step(self): - return self._global_step - - @global_step.setter - def global_step(self, step): - self._global_step = step - - def log_scalar_dict(self, log_dict, prefix=''): - """ log a dictionary of scalar values""" - if self._logger is None: - return - if prefix: - prefix = f'{prefix}_' - for name, value in log_dict.items(): - if isinstance(value, dict): - self.log_scalar_dict(value, - self._global_step, - prefix=f'{prefix}{name}') - else: - self._logger.add_scalar(f'{prefix}{name}', value, - self._global_step) - - def __getattr__(self, name): - if self._logger is None: - return self.noop - return self._logger.__getattribute__(name) - - -TB_LOGGER = TensorboardLogger() - - -class RunningMeter(object): - """ running meteor of a scalar value - (useful for monitoring training loss) - """ - - def __init__(self, name, val=None, smooth=0.99): - self._name = name - self._sm = smooth - self._val = val - - def __call__(self, value): - self._val = (value if self._val is None else value * (1 - self._sm) + - self._val * self._sm) - - def __str__(self): - return f'{self._name}: {self._val:.4f}' - - @property - def val(self): - return self._val - - @property - def name(self): - return self._name \ No newline at end of file diff --git a/AVLFormer/src/utils/metric_logger.py b/AVLFormer/src/utils/metric_logger.py deleted file mode 100644 index 586cfbd..0000000 --- a/AVLFormer/src/utils/metric_logger.py +++ /dev/null @@ -1,272 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -from collections import defaultdict, deque -import os - -import torch - -from .comm import is_main_process - -# class SmoothedValue(object): -# """Track a series of values and provide access to smoothed values over a -# window or the global series average. 
-# """ - -# def __init__(self, window_size=20): -# self.deque = deque(maxlen=window_size) -# # self.series = [] -# self.total = 0.0 -# self.count = 0 - -# def update(self, value): -# self.deque.append(value) -# # self.series.append(value) -# self.count += 1 -# self.total += value - -# @property -# def median(self): -# d = torch.tensor(list(self.deque)) -# return d.median().item() - -# @property -# def avg(self): -# d = torch.tensor(list(self.deque)) -# return d.mean().item() - -# @property -# def global_avg(self): -# return self.total / self.count - -# @property -# def last_value(self): -# return self.deque[-1] - -# class MetricLogger(object): -# def __init__(self, delimiter="\t"): -# self.meters = {} -# self.params = {} -# self.delimiter = delimiter - -# def update_params(self, update_dict): -# for param_group, group_dict in update_dict.items(): -# if param_group not in self.params: -# self.params[param_group] = {} -# for param_name, param_value in group_dict.items(): -# # skipping parameters if they start with '_' -# if param_name.startswith('_'): -# continue -# if isinstance(param_value, torch.Tensor): -# param_value = param_value.item() -# assert isinstance(param_value, (float, int)) -# self.params[param_group][param_name] = param_value - -# def update_metrics(self, update_dict): -# for metric_group, group_dict in update_dict.items(): -# if metric_group not in self.meters: -# self.meters[metric_group] = defaultdict(SmoothedValue) -# for metric_name, metric_value in group_dict.items(): -# # skipping metrics if they start with '_' -# if metric_name.startswith('_'): -# continue -# if isinstance(metric_value, torch.Tensor): -# metric_value = metric_value.item() -# assert isinstance(metric_value, (float, int)) -# self.meters[metric_group][metric_name].update(metric_value) - -# def get_logs(self, iteration): -# return_str = [] -# if len(self.meters) > 0: -# offset_m = max([len(group_name) for group_name in self.meters.keys()]) -# else: -# offset_m = 0 -# if len(self.params) > 0: -# offset_p = max([len(group_name) for group_name in self.params.keys()]) -# else: -# offset_p = 0 -# offset = max(offset_m, offset_p) - -# for group_name, values in sorted(self.meters.items(), -# key=lambda x: x[0]): -# loss_str = [] -# for name, meter in values.items(): -# loss_str.append("{}: {:.4f} ({:.4f})".format( -# name, meter.median, meter.global_avg, -# )) -# return_str.append( -# "{:{offset}s} - {}".format( -# group_name, self.delimiter.join(loss_str), offset=offset, -# ), -# ) -# for group_name, values in self.params.items(): -# loss_str = [] -# for name, param in values.items(): -# loss_str.append("{}: {:.2e}".format(name, param)) -# return_str.append( -# "{:{offset}s} - {}".format( -# group_name, self.delimiter.join(loss_str), offset=offset, -# ), -# ) -# return "\n ".join(return_str) - - -class SmoothedValue(object): - - def __init__(self, window_size=10): - self.deque = deque(maxlen=window_size) - self.total = 0.0 - self.count = 0 - - def update(self, value): - self.deque.append(value) - self.count += 1 - self.total += value - - @property - def median(self): - d = torch.tensor(list(self.deque)) - return d.median().item() - - @property - def avg(self): - d = torch.tensor(list(self.deque)) - return d.mean().item() - - @property - def global_avg(self): - return self.total / self.count - - -class MetricLogger(object): - - def __init__(self, delimiter="\t", meter_creator=SmoothedValue): - self.meters = defaultdict(meter_creator) - self.delimiter = delimiter - - def update(self, **kwargs): - for k, v in 
kwargs.items(): - if isinstance(v, torch.Tensor): - v = v.item() - assert isinstance(v, (float, int)) - self.meters[k].update(v) - - def __getattr__(self, attr): - if attr in self.meters: - return self.meters[attr] - if attr in self.__dict__: - return self.__dict__[attr] - raise AttributeError("'{}' object has no attribute '{}'".format( - type(self).__name__, attr)) - - def __str__(self): - loss_str = [] - for name, meter in self.meters.items(): - loss_str.append("{}: {:.4f} ({:.4f})".format( - name, meter.median, meter.global_avg)) - return self.delimiter.join(loss_str) - - -class TensorboardLogger(MetricLogger): - - def __init__(self, log_dir, delimiter='\t', philly_log_dir=None): - super(TensorboardLogger, self).__init__(delimiter) - try: - from tensorboardX import SummaryWriter - except ImportError: - raise ImportError('To use tensorboard please install tensorboardX ' - '[ pip install tensorboardx ].') - self.philly_tb_logger = None - self.philly_tb_logger_avg = None - self.philly_tb_logger_med = None - if is_main_process(): - self.tb_logger = SummaryWriter(log_dir) - self.tb_logger_avg = SummaryWriter(os.path.join(log_dir, 'avg')) - self.tb_logger_med = SummaryWriter(os.path.join(log_dir, 'med')) - if philly_log_dir is not None: - self.philly_tb_logger = SummaryWriter(philly_log_dir) - self.philly_tb_logger_avg = SummaryWriter( - os.path.join(philly_log_dir, 'avg')) - self.philly_tb_logger_med = SummaryWriter( - os.path.join(philly_log_dir, 'med')) - else: - self.tb_logger = None - self.tb_logger_avg = None - self.tb_logger_med = None - - def get_logs(self, iteration): - if self.tb_logger: - for group_name, values in self.meters.items(): - for name, meter in values.items(): - self.tb_logger.add_scalar( - '{}/{}'.format(group_name, name), - meter.last_value, - iteration, - ) - self.tb_logger_avg.add_scalar( - '{}/{}'.format(group_name, name), - meter.avg, - iteration, - ) - self.tb_logger_med.add_scalar( - '{}/{}'.format(group_name, name), - meter.median, - iteration, - ) - if self.philly_tb_logger: - self.philly_tb_logger.add_scalar( - '{}/{}'.format(group_name, name), - meter.last_value, - iteration, - ) - self.philly_tb_logger_avg.add_scalar( - '{}/{}'.format(group_name, name), - meter.avg, - iteration, - ) - self.philly_tb_logger_med.add_scalar( - '{}/{}'.format(group_name, name), - meter.median, - iteration, - ) - for group_name, values in self.params.items(): - for name, param in values.items(): - self.tb_logger.add_scalar( - '{}/{}'.format(group_name, name), - param, - iteration, - ) - if self.philly_tb_logger: - self.philly_tb_logger.add_scalar( - '{}/{}'.format(group_name, name), - param, - iteration, - ) - return super(TensorboardLogger, self).get_logs(iteration) - - def close(self): - if is_main_process(): - self.tb_logger.close() - self.tb_logger_avg.close() - self.tb_logger_med.close() - if self.philly_tb_logger: - self.philly_tb_logger.close() - self.philly_tb_logger_avg.close() - self.philly_tb_logger_med.close() - - -class AverageMeter(object): - """Computes and stores the average and current value""" - - def __init__(self): - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count diff --git a/AVLFormer/src/utils/miscellaneous.py b/AVLFormer/src/utils/miscellaneous.py deleted file mode 100644 index 0379021..0000000 --- a/AVLFormer/src/utils/miscellaneous.py +++ /dev/null @@ -1,319 +0,0 @@ -# Copyright (c) 
Facebook, Inc. and its affiliates. All Rights Reserved. -import errno -import json -import logging -import os -import os.path as op -from pprint import pformat -import random -import re -import shutil - -from easydict import EasyDict as edict -import numpy as np -import torch -import yaml - -from .basic_utils import save_json -from .comm import is_main_process -from .logger import LOGGER as logger - - -def ensure_directory(path): - if path == '' or path == '.': - return - if path != None and len(path) > 0: - assert not op.isfile(path), '{} is a file'.format(path) - if not os.path.exists(path) and not op.islink(path): - try: - os.makedirs(path) - except: - if os.path.isdir(path): - # another process has done makedir - pass - else: - raise - - -def get_user_name(): - import getpass - return getpass.getuser() - - -def acquireLock(lock_f='/tmp/lockfile.LOCK'): - ''' acquire exclusive lock file access ''' - import fcntl - locked_file_descriptor = open(lock_f, 'w+') - fcntl.lockf(locked_file_descriptor, fcntl.LOCK_EX) - return locked_file_descriptor - - -def releaseLock(locked_file_descriptor): - ''' release exclusive lock file access ''' - locked_file_descriptor.close() - - -def hash_sha1(s): - import hashlib - if type(s) is not str: - s = pformat(s) - return hashlib.sha1(s.encode('utf-8')).hexdigest() - - -def print_trace(): - import traceback - traceback.print_exc() - - -def limited_retry_agent(num, func, *args, **kwargs): - for i in range(num): - try: - return func(*args, **kwargs) - except Exception as e: - logging.info('fails with \n{}: tried {}-th time'.format(e, i + 1)) - import time - print_trace() - if i == num - 1: - raise - time.sleep(5) - - -def exclusive_open_to_read(fname, mode='r'): - disable_lock = os.environ.get('QD_DISABLE_EXCLUSIVE_READ_BY_LOCK') - if disable_lock is not None: - disable_lock = int(disable_lock) - if not disable_lock: - user_name = get_user_name() - lock_fd = acquireLock( - op.join('/tmp', '{}_lock_{}'.format(user_name, hash_sha1(fname)))) - #try: - # in AML, it could fail with Input/Output error. If it fails, we will - # use azcopy as a fall back solution for reading - fp = limited_retry_agent(10, open, fname, mode) - #except: - #if 'FILE_OPEN_AZCOPY_BLOB_ACCOUNT_PATH' in os.environ: - #return azcopy_read(fname) - #else: - #raise - if not disable_lock: - releaseLock(lock_fd) - return fp - - -class NoOp(object): - """ useful for distributed training No-Ops """ - - def __getattr__(self, name): - return self.noop - - def noop(self, *args, **kwargs): - return - - -def str_to_bool(value): - if value.lower() in {'false', 'f', '0', 'no', 'n'}: - return False - elif value.lower() in {'true', 't', '1', 'yes', 'y'}: - return True - raise ValueError(f'{value} is not a valid boolean value') - - -def mkdir(path): - # if it is the current folder, skip. 
-    # otherwise the original code will raise FileNotFoundError
-    if path == '':
-        return
-    try:
-        os.makedirs(path)
-    except OSError as e:
-        if e.errno != errno.EEXIST:
-            raise
-
-
-def save_config(cfg, path):
-    if is_main_process():
-        with open(path, 'w') as f:
-            f.write(cfg.dump())
-
-
-def config_iteration(output_dir, max_iter):
-    save_file = os.path.join(output_dir, 'last_checkpoint')
-    iteration = -1
-    if os.path.exists(save_file):
-        with open(save_file, 'r') as f:
-            fname = f.read().strip()
-        model_name = os.path.basename(fname)
-        model_path = os.path.dirname(fname)
-        if model_name.startswith('model_') and len(model_name) == 17:
-            iteration = int(model_name[-11:-4])
-        elif model_name == "model_final":
-            iteration = max_iter
-        elif model_path.startswith('checkpoint-') and len(model_path) == 18:
-            iteration = int(model_path.split('-')[-1])
-    return iteration
-
-
-def get_matching_parameters(model, regexp, none_on_empty=True):
-    """Returns parameters matching regular expression"""
-    if not regexp:
-        if none_on_empty:
-            return {}
-        else:
-            return dict(model.named_parameters())
-    compiled_pattern = re.compile(regexp)
-    params = {}
-    for weight_name, weight in model.named_parameters():
-        if compiled_pattern.match(weight_name):
-            params[weight_name] = weight
-    return params
-
-
-def freeze_weights(model, regexp):
-    """Freeze weights based on regular expression."""
-    for weight_name, weight in get_matching_parameters(model, regexp).items():
-        weight.requires_grad = False
-        logger.info("Disabled training of {}".format(weight_name))
-
-
-def unfreeze_weights(model,
-                     regexp,
-                     backbone_freeze_at=-1,
-                     is_distributed=False):
-    """
-    WARNING: This is not fully tested and may have issues. It is not used
-    during training now, but is kept here for future reference.
-    Unfreeze weights based on regular expression.
-    This is helpful during training to unfreeze frozen weights after
-    other unfrozen weights have been trained for some iterations.
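-
-    Example (the pattern is hypothetical):
-        unfreeze_weights(model, 'backbone.body.layer4.*')
-    re-enables gradients for every parameter whose name matches the
-    regular expression.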
- """ - for weight_name, weight in get_matching_parameters(model, regexp).items(): - weight.requires_grad = True - logger.info("Enabled training of {}".format(weight_name)) - if backbone_freeze_at >= 0: - logger.info("Freeze backbone at stage: {}".format(backbone_freeze_at)) - if is_distributed: - model.module.backbone.body._freeze_backbone(backbone_freeze_at) - else: - model.backbone.body._freeze_backbone(backbone_freeze_at) - - -def delete_tsv_files(tsvs): - for t in tsvs: - if op.isfile(t): - try_delete(t) - line = op.splitext(t)[0] + '.lineidx' - if op.isfile(line): - try_delete(line) - - -def concat_files(ins, out): - mkdir(op.dirname(out)) - out_tmp = out + '.tmp' - with open(out_tmp, 'wb') as fp_out: - for i, f in enumerate(ins): - logging.info('concating {}/{} - {}'.format(i, len(ins), f)) - with open(f, 'rb') as fp_in: - shutil.copyfileobj(fp_in, fp_out, 1024 * 1024 * 10) - os.rename(out_tmp, out) - - -def concat_tsv_files(tsvs, out_tsv): - concat_files(tsvs, out_tsv) - sizes = [os.stat(t).st_size for t in tsvs] - sizes = np.cumsum(sizes) - all_idx = [] - for i, t in enumerate(tsvs): - for idx in load_list_file(op.splitext(t)[0] + '.lineidx'): - if i == 0: - all_idx.append(idx) - else: - all_idx.append(str(int(idx) + sizes[i - 1])) - with open(op.splitext(out_tsv)[0] + '.lineidx', 'w') as f: - f.write('\n'.join(all_idx)) - - -def load_list_file(fname): - with open(fname, 'r') as fp: - lines = fp.readlines() - result = [line.strip() for line in lines] - if len(result) > 0 and result[-1] == '': - result = result[:-1] - return result - - -def try_once(func): - - def func_wrapper(*args, **kwargs): - try: - return func(*args, **kwargs) - except Exception as e: - logging.info('ignore error \n{}'.format(str(e))) - - return func_wrapper - - -@try_once -def try_delete(f): - os.remove(f) - - -def set_seed(seed, n_gpu): - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - if n_gpu > 0: - torch.cuda.manual_seed_all(seed) - - -def print_and_run_cmd(cmd): - print(cmd) - os.system(cmd) - - -def write_to_yaml_file(context, file_name): - with open(file_name, 'w') as fp: - yaml.dump(context, fp, encoding='utf-8') - - -def load_from_yaml_file(yaml_file): - with open(yaml_file, 'r') as fp: - return yaml.load(fp, Loader=yaml.CLoader) - - -def parse_yaml_file(yaml_file): - r = re.compile('.*fea.*lab.*.yaml') - temp = op.basename(yaml_file).split('.') - split_name = temp[0] - if r.match(yaml_file) is not None: - fea_folder = '.'.join(temp[temp.index('fea') + 1:temp.index('lab')]) - lab_folder = '.'.join(temp[temp.index('lab') + 1:-1]) - else: - fea_folder, lab_folder = None, None - return split_name, fea_folder, lab_folder - - -def check_yaml_file(yaml_file): - # check yaml file, generate if possible - if not op.isfile(yaml_file): - try: - split_name, fea_folder, lab_folder = parse_yaml_file(yaml_file) - if fea_folder and lab_folder: - base_yaml_file = op.join(op.dirname(yaml_file), - split_name + '.yaml') - if op.isfile(base_yaml_file): - data = load_from_yaml_file(base_yaml_file) - data['feature'] = op.join(fea_folder, - split_name + '.feature.tsv') - data['label'] = op.join(lab_folder, - split_name + '.label.tsv') - assert op.isfile( - op.join(op.dirname(base_yaml_file), data['feature'])) - assert op.isfile( - op.join(op.dirname(base_yaml_file), data['label'])) - if is_main_process(): - write_to_yaml_file(data, yaml_file) - print("generate yaml file: {}".format(yaml_file)) - except: - raise ValueError( - "yaml file: {} does not exist and cannot create it".format( - yaml_file)) diff 
--git a/AVLFormer/src/utils/qd_common.py b/AVLFormer/src/utils/qd_common.py
deleted file mode 100644
index 2f113ad..0000000
--- a/AVLFormer/src/utils/qd_common.py
+++ /dev/null
@@ -1,3821 +0,0 @@
-from collections import OrderedDict
-import functools
-import glob
-from io import BytesIO
-import json
-import logging
-import math
-import multiprocessing as mp
-from multiprocessing import Event, Process
-import os
-import re
-import sys
-import traceback
-
-from PIL import Image
-import numpy as np
-import progressbar
-from tqdm import tqdm
-import yaml
-
-try:
-    from itertools import izip as zip
-except ImportError:
-    # in python3, we don't need itertools.izip since zip is izip
-    pass
-import argparse
-import base64
-from datetime import datetime
-import os.path as op
-from pprint import pformat, pprint
-import shutil
-import subprocess as sp
-import time
-
-import cv2
-from ete3 import Tree
-from future.utils import viewitems
-import matplotlib.pyplot as plt
-import psutil
-
-try:
-    # py3
-    from urllib.request import HTTPError, Request, urlopen
-except ImportError:
-    # py2
-    from urllib2 import urlopen, Request
-    from urllib2 import HTTPError
-
-import copy
-import io
-
-from PIL import ImageFile
-from deprecated import deprecated
-
-# https://stackoverflow.com/questions/12984426/python-pil-ioerror-image-file-truncated-with-big-images
-ImageFile.LOAD_TRUNCATED_IMAGES = True
-
-
-def get_sys_memory_usage_info():
-    out = cmd_run(['free'], return_output=True)
-    lines = out.split('\n')
-    headers = lines[0].strip().split(' ')
-    headers = [h for h in headers if len(h) > 0]
-    mem = lines[1]
-    x1, x2 = mem.split(':')
-    assert 'Mem' == x1
-    values = [int(i) for i in x2.split(' ') if len(i) > 0]
-    assert len(headers) == len(values)
-    return dict(zip(headers, values))
-
-def get_mem_usage_in_bytes():
-    import os
-
-    import psutil
-    process = psutil.Process(os.getpid())
-    return process.memory_info().rss  # in bytes
-
-def print_type_memory_usage():
-    from pympler import muppy, summary
-    all_objects = muppy.get_objects()
-    sum1 = summary.summarize(all_objects)
-    summary.print_(sum1)
-
-def encode_np(x):
-    compressed_array = io.BytesIO()
-    np.savez_compressed(compressed_array, x)
-    return base64.b64encode(compressed_array.getvalue())
-
-def decode_np(s):
-    s = base64.b64decode(s)
-    return np.load(io.BytesIO(s))['arr_0']
-
-def print_trace():
-    import traceback
-    traceback.print_exc()
-
-def get_trace():
-    import traceback
-    return traceback.format_exc()
-
-def try_once(func):
-    def func_wrapper(*args, **kwargs):
-        try:
-            return func(*args, **kwargs)
-        except Exception as e:
-            logging.info('ignore error \n{}'.format(str(e)))
-            print_trace()
-    return func_wrapper
-
-def master_process_run(func):
-    def func_wrapper(*args, **kwargs):
-        if get_mpi_rank() == 0:
-            try:
-                return func(*args, **kwargs)
-            except Exception as e:
-                logging.info('ignore error \n{}'.format(str(e)))
-                print_trace()
-    return func_wrapper
-
-@try_once
-def try_delete(f):
-    os.remove(f)
-
-def list_to_nested_dict(xs, idxes):
-    rest_idxes = set(range(len(xs[0]))).difference(idxes)
-    result = {}
-    for r in xs:
-        curr_result = result
-        for i in idxes[:-1]:
-            if r[i] not in curr_result:
-                curr_result[r[i]] = {}
-            curr_result = curr_result[r[i]]
-        key = r[idxes[-1]]
-        if key not in curr_result:
-            curr_result[key] = []
-        value = [r[i] for i in rest_idxes]
-        if len(value) == 1:
-            value = value[0]
-        curr_result[key].append(value)
-    return result
-
-def
make_by_pattern_result(data, pattern_results): - for p, result in pattern_results: - match_result = re.match(p, data) - if match_result is not None: - return result - -def make_by_pattern_maker(data, pattern_makers): - for p, maker in pattern_makers: - match_result = re.match(p, data) - if match_result is not None: - return maker() - -def is_positive_uhrs_verified(r): - uhrs = r['uhrs'] - y, n = uhrs.get('1', 0), uhrs.get('2', 0) - return y > n - -def is_negative_uhrs_verified(r): - uhrs = r['uhrs'] - y, n = uhrs.get('1', 0), uhrs.get('2', 0) - return n > y - -def find_float_tolorance_unequal(d1, d2): - # return a list of string. Each string means a path where the value is - # different - from past.builtins import basestring - if all(isinstance(x, basestring) for x in [d1, d2]) or \ - all(type(x) is bool for x in [d1, d2]): - if d1 != d2: - return ['0'] - else: - return [] - if type(d1) is int and type(d2) is int: - if d1 == d2: - return [] - else: - return ['0'] - if type(d1) in [int, float] and type(d2) in [int, float]: - equal = abs(d1 - d2) <= 0.00001 * abs(d1) - if equal: - return [] - else: - return ['0'] - if isinstance(d1, (dict, OrderedDict)) and isinstance(d2, (dict, OrderedDict)): - if len(d1) != len(d2): - return ['0'] - path_d1 = dict_get_all_path(d1, with_type=True) - result = [] - for p in path_d1: - v1 = dict_get_path_value(d1, p, with_type=True) - if not dict_has_path(d2, p, with_type=True): - result.append(p) - else: - v2 = dict_get_path_value(d2, p, with_type=True) - curr_result = find_float_tolorance_unequal(v1, v2) - for r in curr_result: - result.append(p + '$' + r) - return result - if isinstance(d1, np.ndarray) and isinstance(d2, np.ndarray): - diff = np.absolute((d1 - d2)).sum() - s = np.absolute(d1).sum() - equal = diff < 1e-5 * s - if equal: - return [] - else: - return ['0'] - if type(d1) in [tuple, list] and type(d2) in [tuple, list]: - if len(d1) != len(d2): - return ['-1'] - result = [] - for i, (x1, x2) in enumerate(zip(d1, d2)): - curr_result = find_float_tolorance_unequal(x1, x2) - for r in curr_result: - result.append('{}${}'.format(i, r)) - return result - if type(d1) != type(d2): - return ['0'] - else: - import torch - if isinstance(d1, torch.Tensor): - diff = (d1 - d2).float().abs().sum() - s = d1.float().abs().sum() - if float(s) < 1e-5: - equal = diff < 1e-5 - else: - equal = float(diff / s) < 1e-5 - if equal: - return [] - else: - return ['0'] - else: - raise Exception('unknown type') - -def float_tolorance_equal(d1, d2, check_order=True): - from past.builtins import basestring - if isinstance(d1, basestring) and isinstance(d2, basestring): - return d1 == d2 - if type(d1) in [int, float] and type(d2) in [int, float]: - return abs(d1 - d2) <= 0.00001 * abs(d1) - if type(d1) != type(d2) and \ - (not (type(d1) in [tuple, list] and - type(d2) in [tuple, list])): - return False - if type(d1) in [dict, OrderedDict]: - if len(d1) != len(d2): - return False - for k in d1: - if k not in d2: - return False - v1, v2 = d1[k], d2[k] - if not float_tolorance_equal(v1, v2): - return False - return True - elif type(d1) in [tuple, list]: - if len(d1) != len(d2): - return False - if not check_order: - d1 = sorted(d1, key=lambda x: pformat(x)) - d2 = sorted(d2, key=lambda x: pformat(x)) - for x1, x2 in zip(d1, d2): - if not float_tolorance_equal(x1, x2, check_order): - return False - return True - elif type(d1) is bool: - return d1 == d2 - elif d1 is None: - return d1 == d2 - elif type(d1) is datetime: - if d1.tzinfo != d2.tzinfo: - return d1.replace(tzinfo=d2.tzinfo) 
== d2 - else: - return d1 == d2 - elif type(d1) is np.ndarray: - if not float_tolorance_equal(d1.shape, d2.shape, check_order=True): - return False - return np.absolute(d1 - d2).sum() <= 1e-5 * np.absolute(d1).sum() - elif type(d1) in [np.float64]: - return np.absolute(d1 - d2).sum() <= 1e-5 * np.absolute(d1).sum() - else: - import torch - if type(d1) is torch.Tensor: - diff = (d1 - d2).abs().sum() - s = d1.abs().sum() - if s < 1e-5: - return diff < 1e-5 - else: - return diff / s < 1e-5 - else: - raise Exception('unknown type') - -def case_incensitive_overlap(all_terms): - all_lower_to_term = [{t.lower(): t for t in terms} for terms in all_terms] - all_lowers = [set(l.keys()) for l in all_lower_to_term] - anchor = all_lowers[0].intersection(*all_lowers[1:]) - - return [[lower_to_term[l] for l in anchor] - for lower_to_term in all_lower_to_term] - -def get_executable(): - return sys.executable - -def collect_process_info(): - result = {} - for process in psutil.process_iter(): - result[process.pid] = {} - result[process.pid]['username'] = process.username() - result[process.pid]['time_spent_in_hour'] = (int(time.time()) - - process.create_time()) / 3600.0 - result[process.pid]['cmdline'] = ' '.join(process.cmdline()) - return result - -def remote_run(str_cmd, ssh_info, return_output=False): - cmd = ['ssh', '-t', '-t', '-o', 'StrictHostKeyChecking no'] - for key in ssh_info: - if len(key) > 0 and key[0] == '-': - cmd.append(key) - cmd.append(str(ssh_info[key])) - cmd.append('{}@{}'.format(ssh_info['username'], ssh_info['ip'])) - if is_cluster(ssh_info): - prefix = 'source ~/.bashrc && export PATH=/usr/local/nvidia/bin:$PATH && ' - else: - cs = [] - # don't use anaconda since caffe is slower under anaconda because of the - # data preprocessing. not i/o - cs.append('source ~/.bashrc') - if 'conda' in get_executable(): - cs.append('export PATH=$HOME/anaconda3/bin:\$PATH') - cs.append('export LD_LIBRARY_PATH=$HOME/anaconda3/lib:\$LD_LIBRARY_PATH') - cs.append('export PATH=/usr/local/nvidia/bin:\$PATH') - cs.append('export OMP_NUM_THREADS=1') - prefix = ' && '.join(cs) + ' && ' - - suffix = ' && hostname' - ssh_command = '{}{}{}'.format(prefix, str_cmd, suffix) - # this will use the environment variable like what you have after ssh - ssh_command = 'bash -i -c "{}"'.format(ssh_command) - cmd.append(ssh_command) - - return cmd_run(cmd, return_output) - -def compile_by_docker(src_zip, docker_image, dest_zip): - # compile the qd zip file and generate another one by compiling. so that - # there is no need to compile it again. 
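-    # illustrative call (paths and image name are hypothetical):
-    #   compile_by_docker('/data/qd.zip', 'repo/image:tag', '/data/qd.compiled.zip')
-    # the source zip is mounted into the container, compile.aml.sh is run
-    # inside it, and the re-zipped result is copied back to dest_zip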
- src_fname = op.basename(src_zip) - src_folder = op.dirname(src_zip) - - docker_src_folder = '/tmpwork' - docker_src_zip = op.join(docker_src_folder, src_fname) - docker_out_src_fname = src_fname + '.out.zip' - docker_out_zip = op.join(docker_src_folder, docker_out_src_fname) - out_zip = op.join(src_folder, docker_out_src_fname) - docker_compile_folder = '/tmpcompile' - cmd = ['docker', 'run', - '-v', '{}:{}'.format(src_folder, docker_src_folder), - docker_image, - ] - cmd.append('/bin/bash') - cmd.append('-c') - compile_cmd = [ - 'mkdir -p {}'.format(docker_compile_folder), - 'cd {}'.format(docker_compile_folder), - 'unzip {}'.format(docker_src_zip), - 'bash compile.aml.sh', - 'zip -yrv x.zip *', - 'cp x.zip {}'.format(docker_out_zip), - 'chmod a+rw {}'.format(docker_out_zip), - ] - cmd.append(' && '.join(compile_cmd)) - cmd_run(cmd) - ensure_directory(op.dirname(dest_zip)) - copy_file(out_zip, dest_zip) - -def zip_qd(out_zip, options=None): - ensure_directory(op.dirname(out_zip)) - cmd = [ - 'zip', - '-uyrv', - out_zip, - '*', - ] - if options: - cmd.extend(options) - else: - cmd.extend([ - '-x', - '\*src/CCSCaffe/\*', - '-x', - '\*src/build/lib.linux-x86_64-2.7/\*', - '-x', - '\*build/lib.linux-x86_64-2.7/\*', - '-x', - '\*build/temp.linux-x86_64-2.7/\*', - '-x', - '\*build/lib.linux-x86_64-3.5/\*', - '-x', - '\*build/temp.linux-x86_64-3.5/\*', - '-x', - '\*build/lib.linux-x86_64-3.7/\*', - '-x', - 'assets\*', - '-x', - '\*build/temp.linux-x86_64-3.7/\*', - '-x', - '\*build/lib.linux-x86_64-3.6/\*', - '-x', - '\*build/temp.linux-x86_64-3.6/\*', - '-x', - '\*src/detectron2/datasets/\*', - '-x', - '\*src/CCSCaffe/models/\*', - '-x', - '\*src/CCSCaffe/data/\*', - '-x', - '\*src/CCSCaffe/examples/\*', - '-x', - '\*src/detectron2/output\*', - '-x', - 'aux_data/yolo9k/\*', - '-x', - 'visualization\*', - '-x', - 'output\*', - '-x', - 'data\*', - '-x', - '\*.build_release\*', - '-x', - '\*.build_debug\*', - '-x', - '\*.build\*', - '-x', - '\*tmp_run\*', - '-x', - '\*src/CCSCaffe/MSVC/\*', - '-x', - '\*.pyc', - '-x', - '\*.so', - '-x', - '\*.o', - '-x', - '\*src/CCSCaffe/docs/tutorial/\*', - '-x', - '\*src/CCSCaffe/matlab/\*', - '-x', - '\*.git\*', - '-x', - '\*src/qd_classifier/.cache/\*', - '\*wandb\*', - ]) - cmd_run(cmd, working_dir=os.getcwd(), shell=True) - -def func_retry_agent(info, func, *args, **kwargs): - i = 0 - num = info.get('retry_times', -1) - throw_if_fail = info.get('throw_if_fail') - while True: - try: - return func(*args, **kwargs) - except Exception: - logging.info('fails: try {}-th time'.format(i)) - print_trace() - i = i + 1 - if num > 0 and i >= num: - if throw_if_fail: - raise - else: - break - import random - import time - time.sleep(random.random() * 5) - -def limited_retry_agent(num, func, *args, **kwargs): - for i in range(num): - try: - return func(*args, **kwargs) - except Exception as e: - logging.info('fails with \n{}: tried {}-th time'.format( - e, - i + 1)) - import time - print_trace() - if i == num - 1: - raise - time.sleep(5) - -def retry_agent(func, *args, **kwargs): - return func_retry_agent( - {'retry_times': -1}, - func, *args, **kwargs, - ) - -def ensure_copy_folder(src_folder, dst_folder): - ensure_directory(dst_folder) - cmd_run('rsync -ravz {}/ {} --progress'.format( - src_folder, dst_folder).split(' ')) - -def get_current_time_as_str(): - return datetime.now().strftime('%Y_%m_%d_%H_%M_%S') - -def iter_swap_param_simple(swap_params): - if isinstance(swap_params, dict): - swap_params = [[k, v] for k, v in swap_params.items()] - num = len(swap_params) 
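-    # odometer-style cartesian product: scalar values are wrapped into
-    # one-element lists below, then the index vector is incremented from its
-    # last position; e.g. {'lr': [0.1, 0.01], 'bs': 32} yields
-    # {'lr': 0.1, 'bs': 32} and then {'lr': 0.01, 'bs': 32}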
- for p in swap_params: - if type(p[1]) is not list and type(p[1]) is not tuple: - p[1] = [p[1]] - counts = [len(p[1]) for p in swap_params] - assert all(c > 0 for c in counts) - idx = [0] * num - - while True: - result = {} - for p, i in zip(swap_params, idx): - result[p[0]] = p[1][i] - yield result - - for i in range(num - 1, -1, -1): - idx[i] = idx[i] + 1 - if idx[i] < counts[i]: - break - else: - idx[i] = 0 - if i == 0: - return - -def iter_swap_param(swap_params): - if isinstance(swap_params, dict): - swap_params = [(k, v) for k, v in swap_params.items()] - num = len(swap_params) - for p in swap_params: - if type(p[1]) is not list and type(p[1]) is not tuple: - p[1] = [p[1]] - counts = [len(p[1]) for p in swap_params] - empty_keys = [k for k, vs in swap_params if len(vs) == 0] - assert len(empty_keys) == 0, empty_keys - idx = [0] * num - - while True: - result = {} - for p, i in zip(swap_params, idx): - key = p[0] - value = p[1][i] - if isinstance(key, tuple): - for sub_key, sub_value in zip(key, value): - dict_update_path_value(result, sub_key, sub_value) - else: - dict_update_path_value(result, key, value) - yield result - - for i in range(num - 1, -1, -1): - idx[i] = idx[i] + 1 - if idx[i] < counts[i]: - break - else: - idx[i] = 0 - if i == 0: - return - -def gen_uuid(): - import uuid - return uuid.uuid4().hex - -def remove_dir(d): - ensure_remove_dir(d) - -def ensure_remove_file(d): - if op.isfile(d) or op.islink(d): - try: - os.remove(d) - except: - pass - -def ensure_remove_dir(d): - is_dir = op.isdir(d) - is_link = op.islink(d) - if is_dir: - if not is_link: - shutil.rmtree(d) - else: - os.unlink(d) - -def split_to_chunk(all_task, num_chunk=None, num_task_each_chunk=None): - if num_task_each_chunk is None: - num_task_each_chunk = (len(all_task) + num_chunk - 1) // num_chunk - result = [] - i = 0 - while True: - start = i * num_task_each_chunk - end = start + num_task_each_chunk - if start >= len(all_task): - break - if end > len(all_task): - end = len(all_task) - result.append(all_task[start:end]) - i = i + 1 - return result - -def hash_sha1(s): - import hashlib - if type(s) is not str: - s = pformat(s) - return hashlib.sha1(s.encode('utf-8')).hexdigest() - -def copy_file(src, dest): - tmp = dest + '.tmp' - # we use rsync because it could output the progress - cmd_run('rsync {} {} --progress'.format(src, tmp).split(' ')) - os.rename(tmp, dest) - -def ensure_copy_file(src, dest): - ensure_directory(op.dirname(dest)) - if not op.isfile(dest): - copy_file(src, dest) - -def decode_to_str(x): - try: - return x.decode('utf-8') - except UnicodeDecodeError: - return x.decode('latin-1') - -def cmd_run(list_cmd, - return_output=False, - env=None, - working_dir=None, - stdin=sp.PIPE, - shell=False, - dry_run=False, - silent=False, - process_input=None, - stdout=None, - ): - if not silent: - logging.info('start to cmd run: {}'.format(' '.join(map(str, list_cmd)))) - if working_dir: - logging.info(working_dir) - # if we dont' set stdin as sp.PIPE, it will complain the stdin is not a tty - # device. Maybe, the reson is it is inside another process. - # if stdout=sp.PIPE, it will not print the result in the screen - e = os.environ.copy() - if 'SSH_AUTH_SOCK' in e: - del e['SSH_AUTH_SOCK'] - if working_dir: - ensure_directory(working_dir) - if env: - for k in env: - e[k] = env[k] - if dry_run: - # we need the log result. 
Thus, we do not return at the very beginning
-        return
-    if not return_output:
-        #if env is None:
-        #p = sp.Popen(list_cmd, stdin=sp.PIPE, cwd=working_dir)
-        #else:
-        p = sp.Popen(' '.join(list_cmd) if shell else list_cmd,
-                     stdin=stdin,
-                     env=e,
-                     shell=shell,
-                     stdout=stdout,
-                     cwd=working_dir)
-        message = p.communicate(input=process_input)
-        if p.returncode != 0:
-            raise ValueError(message)
-        return message
-    else:
-        if shell:
-            message = sp.check_output(' '.join(list_cmd),
-                                      env=e,
-                                      cwd=working_dir,
-                                      shell=True)
-        else:
-            message = sp.check_output(list_cmd,
-                                      env=e,
-                                      cwd=working_dir,
-                                      )
-    if not silent:
-        logging.info('finished the cmd run')
-    return decode_to_str(message)
-
-def parallel_imap(func, all_task, num_worker=16):
-    if num_worker > 0:
-        #from multiprocessing import Pool
-        from pathos.multiprocessing import Pool
-        m = Pool(num_worker)
-        result = []
-        for x in qd_tqdm(m.imap(func, all_task), total=len(all_task)):
-            result.append(x)
-        # some errors come out of os.fork(), which say
-        #     OSError: [Errno 24] Too many open files.
-        #     self.pid = os.fork()
-        # here, we explicitly close the pool and see if it helps. note: this
-        # is not verified to work; if we still see that kind of error
-        # message, we need another solution
-        m.close()
-        return result
-    else:
-        result = []
-        for t in all_task:
-            result.append(func(t))
-        return result
-
-def parallel_map(func, all_task, num_worker=16):
-    if num_worker > 0:
-        from pathos.multiprocessing import ProcessingPool as Pool
-        m = Pool(num_worker)
-        result = m.map(func, all_task)
-        m.close()
-        return result
-    else:
-        result = []
-        for t in all_task:
-            result.append(func(t))
-        return result
-
-def url_to_file_by_wget(url, fname):
-    ensure_directory(op.dirname(fname))
-    cmd_run(['wget', url, '-O', fname])
-
-@functools.lru_cache(maxsize=1)
-def logging_once(s):
-    logging.info(s)
-
-# this is specifically for azure blob url, where the last 1k bytes operation is
-# not supported.
We have to first find the length and then find the start -# point -def get_url_fsize(url): - result = cmd_run(['curl', '-sI', url], return_output=True) - for row in result.split('\n'): - ss = [s.strip() for s in row.split(':')] - if len(ss) == 2 and ss[0] == 'Content-Length': - size_in_bytes = int(ss[1]) - return size_in_bytes - -def url_to_file_by_curl(url, fname, bytes_start=None, bytes_end=None): - ensure_directory(op.dirname(fname)) - if bytes_start == 0 and bytes_end == 0: - cmd_run(['touch', fname]) - return - if bytes_start is None: - bytes_start = 0 - elif bytes_start < 0: - size = get_url_fsize(url) - bytes_start = size + bytes_start - if bytes_start < 0: - bytes_start = 0 - if bytes_end is None: - # -f: if it fails, no output will be sent to output file - if bytes_start == 0: - cmd_run(['curl', '-f', - url, '--output', fname]) - else: - cmd_run(['curl', '-f', '-r', '{}-'.format(bytes_start), - url, '--output', fname]) - else: - # curl: end is inclusive - cmd_run(['curl', '-f', '-r', '{}-{}'.format(bytes_start, bytes_end - 1), - url, '--output', fname]) - -def url_to_bytes(url): - try: - fp = urlopen(url, timeout=30) - buf = fp.read() - real_url = fp.geturl() - if real_url != url and (not real_url.startswith('https') or - real_url.replace('https', 'http') != url): - logging.info('new url = {}; old = {}'.format(fp.geturl(), url)) - # the image gets redirected, which means the image is not available - return None - return buf - except HTTPError as err: - logging.error("url: {}; error code {}; message: {}".format( - url, err.code, err.msg)) - return None - except: - import traceback - logging.error("url: {}; unknown {}".format( - url, traceback.format_exc())) - return None - -def url_to_str(url): - try: - fp = urlopen(url, timeout=30) - buf = fp.read() - real_url = fp.geturl() - if real_url != url and (not real_url.startswith('https') or - real_url.replace('https', 'http') != url): - logging.info('new url = {}; old = {}'.format(fp.geturl(), url)) - # the image gets redirected, which means the image is not available - return None - if type(buf) is str: - # py2 - return buf - else: - # py3 - return buf.decode() - except HTTPError as err: - logging.error("url: {}; error code {}; message: {}".format( - url, err.code, err.msg)) - return None - except: - logging.error("url: {}; unknown {}".format( - url, traceback.format_exc())) - return None - -def image_url_to_bytes(url): - req = Request(url, headers={ - "User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"}) - try: - response = urlopen(req, None, 10) - if response.code != 200: - logging.info("url: {}, error code: {}".format(url, response.code)) - return None - data = response.read() - response.close() - return data - except Exception as e: - logging.info("error downloading: {}".format(e)) - return None - -def str_to_image(buf): - image = np.asarray(bytearray(buf), dtype='uint8') - im = cv2.imdecode(image, cv2.IMREAD_COLOR) - return im - -def bytes_to_image(bs): - image = np.asarray(bytearray(bs), dtype='uint8') - return cv2.imdecode(image, cv2.IMREAD_COLOR) - -def url_to_image(url): - buf = url_to_bytes(url) - if buf is None: - return None - else: - image = np.asarray(bytearray(buf), dtype='uint8') - return cv2.imdecode(image, cv2.IMREAD_COLOR) - -def normalize_to_str(s): - if sys.version_info.major == 3: - return s - else: - if type(s) is str: - s = s.decode('unicode_escape') - import unicodedata - return unicodedata.normalize('NFKD', s).encode('ascii','ignore') - -def 
query_wiki_info(query_term):
-    query_term = '{} site:en.wikipedia.org'.format(query_term)
-    rls = limited_retry_agent(10, scrape_bing_general_rich, query_term, 1)
-    if not rls:
-        return {'query_term': query_term}
-    rl = rls[0]
-    n_title = normalize_to_str(rl['title'])
-    result = re.match('(.*) - Wikipedia', n_title)
-    if result:
-        best_name = result.groups()[0]
-    else:
-        best_name = query_term
-    result = {
-        'query_term': query_term,
-        'best_name': best_name,
-        'wiki_title': rl['title'],
-        'norm_wiki_title': normalize_to_str(rl['title']),
-        'wiki_url': rl['url']}
-    #logging.info(pformat(result))
-    return result
-
-def scrape_bing_general_rich(query_term, depth):
-    '''
-    note, the order of the returned list is not the same as the web query
-    order; even if we keep the order in which items are added, the result
-    order still differs.
-    '''
-    # we might add duplicated terms, so we need to 1) deduplicate and 2) keep
-    # the order in which they were added
-    import xml.etree.ElementTree as ET
-
-    import requests
-    format_str = \
-        'http://www.bing.com/search?q={}&form=MONITR&qs=n&format=pbxml&first={}&count={}&fdpriority=premium&mkt=en-us'
-    start = 0
-    all_result = []
-    while True:
-        count = min(depth - start, 150)
-        if count <= 0:
-            break
-        query_str = format_str.format(query_term, start, count)
-        start = start + count
-        r = requests.get(query_str, allow_redirects=True)
-        content = r.content
-        #content = urllib2.urlopen(query_str).read()
-        root = ET.fromstring(content)
-        for t in root.iter('k_AnswerDataKifResponse'):
-            if t.text is None:
-                continue
-            text_result = json.loads(t.text)
-            if 'results' not in text_result:
-                continue
-            results = text_result['results']
-            for r in results:
-                rl = {k.lower(): r[k] for k in r}
-                if 'url' in rl and 'title' in rl and len(rl['title']) > 0:
-                    all_result.append(rl)
-    url_to_result = {}
-    for rl in all_result:
-        url = rl['url']
-        if url not in url_to_result:
-            url_to_result[url] = rl
-
-    return list(url_to_result.values())
-
-def request_by_browser(url):
-    import bs4
-    from selenium import webdriver
-    from selenium.webdriver.chrome.options import Options
-    chrome_options = Options()
-    chrome_options.add_argument("--headless")
-    driver = webdriver.Chrome(options=chrome_options)
-    driver.get(url)
-    soup = bs4.BeautifulSoup(driver.page_source, features='lxml')
-    # if we return immediately, the page_source might not be ready
-    time.sleep(1)
-    soup = bs4.BeautifulSoup(driver.page_source, features='lxml')
-    return soup
-
-def iter_bing_visual_search(query_url, origin_url=True):
-    format_str = 'http://www.bing.com/images/searchbyimage?FORM=IRSBIQ&cbir=sbi&imgurl={0}'
-    # the following two parameters are not valid
-    #format_str += '&first=100'
-    #format_str += '&count=10'
-    bing_url = format_str.format(query_url)
-    soup = request_by_browser(bing_url)
-    html_keywords = ['richImage relImg', 'richImage relProd flyout']
-    alts = ['See related image detail', 'See related product detail']
-    caption_classes = ['span', 'a']
-    for html_key_word, alt, caption_class in zip(html_keywords, alts, caption_classes):
-        for i, container in enumerate(soup.find_all(class_=html_key_word)):
-            # one container has one image and one caption container.
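(the info dict yielded below collects rank, html_keyword, url, bing_cache_url and caption from it.)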
we will - # extract the image and the caption, which might be helpful in the - # future - info = {'rank': i} - info['html_keyword'] = html_key_word - # original url - if origin_url: - imgs = container.find_all(class_='richImgLnk') - if len(imgs) == 1: - img = imgs[0] - url = 'http://www.bing.com/images/search' + img.attrs['href'] - result = request_by_browser(url) - imgs = result.find_all(alt='See the source image') - if len(imgs) == 1: - img = imgs[0] - url = img.attrs['src'] - info['url'] = url - - # bing cache image - imgs = container.find_all('img', alt=alt) - if len(imgs) == 1: - bing_cache_url = 'http://www.bing.com' + imgs[0].attrs['src'] - info['bing_cache_url'] = bing_cache_url - - captions = container.find_all(caption_class, class_='tit') - if len(captions) == 1: - cap = captions[0] - info['caption'] = cap.text - yield info - -def scrape_bing_general(query_term, depth): - ''' - note, the order of url list is not the same as the web query. Even we keep - the order of how to add it to the result, the order is still not the same. - ''' - # we might add duplicated terms. need 1) deduplicate, 2) keep the order - # when it is added - import xml.etree.ElementTree as ET - - import requests - format_str = \ - 'http://www.bing.com/search?q={}&form=MONITR&qs=n&format=pbxml&first={}&count={}&fdpriority=premium&mkt=en-us' - start = 0 - all_url = [] - while True: - count = min(depth - start, 150) - if count <= 0: - break - query_str = format_str.format(query_term, start, count) - start = start + count - r = requests.get(query_str, allow_redirects=True) - content = r.content - #content = urllib2.urlopen(query_str).read() - root = ET.fromstring(content) - for t in root.iter('k_AnswerDataKifResponse'): - if t.text is None: - continue - text_result = json.loads(t.text) - if 'results' not in text_result: - continue - results = text_result['results'] - for r in results: - rl = {k.lower() : r[k] for k in r} - if 'url' in rl: - url = rl['url'] - all_url.append(url) - return list(set(all_url)) - -def scrape_bing(query_term, depth, trans_bg=False): - ''' this is for image; for text, use scrape_bing_general - e.g. 
scrape_bing('elder person', 300) - ''' - import xml.etree.ElementTree as ET - - import requests - format_str = \ - 'http://www.bing.com/images/search?q={}&form=MONITR&qs=n&format=pbxml&first={}&count={}&fdpriority=premium&mkt=en-us' - start = 0 - all_url = [] - while True: - count = min(depth - start, 150) - if count <= 0: - break - query_str = format_str.format(query_term, start, count) - if trans_bg: - query_str += "&&qft=+filterui:photo-transparent" - start = start + count - logging.info(query_str) - r = requests.get(query_str, allow_redirects=True) - content = r.content - #content = urllib2.urlopen(query_str).read() - root = ET.fromstring(content) - for t in root.iter('k_AnswerDataKifResponse'): - results = json.loads(t.text)['results'] - for r in results: - rl = {k.lower() : r[k] for k in r} - media_url = rl.get('mediaurl', '') - #url = rl.get('url', '') - #title = rl.get('title', '') - all_url.append(media_url) - break - return all_url - - -def calculate_correlation_between_terms(iter1, iter2): - label_to_num1 = {} - label_to_num2 = {} - ll_to_num = {} - - for (k1, str_rects1), (k2, str_rects2) in zip(iter1, iter2): - assert k1 == k2, 'keys should be aligned ({} != {})'.format(k1, k2) - rects1 = json.loads(str_rects1) - rects2 = json.loads(str_rects2) - for r in rects1: - c = r['class'] - label_to_num1[c] = label_to_num1.get(c, 0) + 1 - for r in rects2: - c = r['class'] - label_to_num2[c] = label_to_num2.get(c, 0) + 1 - for r1 in rects1: - for r2 in rects2: - i = calculate_iou(r1['rect'], r2['rect']) - if i > 0.01: - k = (r1['class'], r2['class']) - ll_to_num[k] = ll_to_num.get(k, 0) + i - ll_correlation = [(ll[0], ll[1], 1. * ll_to_num[ll] / (label_to_num1[ll[0]] - + label_to_num2[ll[1]] - ll_to_num[ll])) - for ll in ll_to_num] - ll_correlation = [(left, right, c) for left, right, c in ll_correlation - if left.lower() != right.lower()] - ll_correlation = sorted(ll_correlation, key=lambda x: -x[2]) - - return ll_correlation - -def json_dump(obj): - # order the keys so that each operation is deterministic though it might be - # slower - return json.dumps(obj, sort_keys=True, separators=(',', ':')) - -def set_if_not_exist(d, key, value): - if key not in d: - d[key] = value - -def print_as_html(table, html_output): - from jinja2 import Environment, FileSystemLoader - j2_env = Environment(loader=FileSystemLoader('./'), trim_blocks=True) - # find the cols with longest length. 
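(e.g., for table = {'r1': {'a': 1}, 'r2': {'b': 2, 'c': 3}}, the longest row gives cols = ['b', 'c'].)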
If it does not include all cols, then - # append those not included - _, cols = max([(len(table[row]), table[row]) for row in table], - key=lambda x: x[0]) - cols = list(cols) - for row in table: - for c in table[row]: - if c not in cols: - cols.append(c) - r = j2_env.get_template('aux_data/html_template/table_viewer.html').render( - table=table, - rows=table.keys(), - cols=cols) - write_to_file(r, html_output) - -def jinja_render(template, **kwargs): - if len(kwargs) == 0: - return read_to_buffer(template).decode() - from jinja2 import Environment, FileSystemLoader - j2_env = Environment(loader=FileSystemLoader('./'), trim_blocks=True) - return j2_env.get_template(template).render( - **kwargs, - ) - -def parse_general_args(): - parser = argparse.ArgumentParser(description='General Parser') - parser.add_argument('-c', '--config_file', help='config file', - type=str) - parser.add_argument('-p', '--param', help='parameter string, yaml format', - type=str) - parser.add_argument('-bp', '--base64_param', help='base64 encoded yaml format', - type=str) - args = parser.parse_args() - kwargs = {} - if args.config_file: - logging.info('loading parameter from {}'.format(args.config_file)) - configs = load_from_yaml_file(args.config_file) - for k in configs: - kwargs[k] = configs[k] - if args.base64_param: - configs = load_from_yaml_str(base64.b64decode(args.base64_param)) - for k in configs: - if k not in kwargs: - kwargs[k] = configs[k] - elif kwargs[k] == configs[k]: - continue - else: - logging.info('overwriting {} to {} for {}'.format(kwargs[k], - configs[k], k)) - kwargs[k] = configs[k] - if args.param: - configs = load_from_yaml_str(args.param) - dict_ensure_path_key_converted(configs) - for k in configs: - if k not in kwargs: - kwargs[k] = configs[k] - elif kwargs[k] == configs[k]: - continue - else: - logging.info('overwriting {} to {} for {}'.format(kwargs[k], - configs[k], k)) - kwargs[k] = configs[k] - return kwargs - -class ProgressBar(object): - def __init__(self, maxval): - assert maxval > 0 - self.maxval = maxval - - def __enter__(self): - self.pbar = progressbar.ProgressBar(maxval=self.maxval).start() - return self - - def __exit__(self, t, v, traceback): - self.update(self.maxval) - sys.stdout.write('\n') - - def update(self, i): - self.pbar.update(i) - -def concat_files(ins, out): - ensure_directory(op.dirname(out)) - out_tmp = out + '.tmp' - with open(out_tmp, 'wb') as fp_out: - for i, f in enumerate(ins): - logging.info('concating {}/{} - {}'.format(i, len(ins), f)) - with open(f, 'rb') as fp_in: - shutil.copyfileobj(fp_in, fp_out, 1024*1024*10) - os.rename(out_tmp, out) - -def get_mpi_rank(): - if 'RANK' in os.environ: - return int(os.environ['RANK']) - return int(os.environ.get('OMPI_COMM_WORLD_RANK', '0')) - -def get_mpi_size(): - if 'WORLD_SIZE' in os.environ: - return int(os.environ['WORLD_SIZE']) - return int(os.environ.get('OMPI_COMM_WORLD_SIZE', '1')) - -def get_mpi_local_rank(): - if 'LOCAL_RANK' in os.environ: - return int(os.environ['LOCAL_RANK']) - return int(os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK', '0')) - -def get_mpi_local_size(): - if 'LOCAL_SIZE' in os.environ: - return int(os.environ['LOCAL_SIZE']) - return int(os.environ.get('OMPI_COMM_WORLD_LOCAL_SIZE', '1')) - -def load_class_ap(full_expid, predict_file): - # the normal map - report_file = op.splitext(predict_file)[0] + '.report' - fname = op.join('output', full_expid, 'snapshot', report_file + - '.class_ap.json') - if op.isfile(fname): - return json.loads(read_to_buffer(fname)) - - glob_pattern = 
op.splitext(predict_file)[0] + '.neg_aware_gmap*report' - fnames = glob.glob(op.join('output', full_expid, 'snapshot', - glob_pattern)) - if len(fnames) > 0 and op.isfile(fnames[0]): - fname = fnames[0] - result = load_from_yaml_file(fname) - return {'overall': {'0.5': {'class_ap': result['class_ap']}}} - - -def calculate_ap_by_true_list(corrects, total): - precision = (1. * np.cumsum(corrects)) / np.arange(1, 1 + len(corrects)) - if np.sum(corrects) == 0: - return 0 - return np.sum(precision * corrects) / total - -def calculate_ap_by_true_list_count_num(corrects, total): - precision = (1. * np.cumsum(corrects)) / np.arange(1, 1 + len(corrects)) - if np.sum(corrects) == 0: - return 0 - return np.sum(precision) / len(precision) * np.sum(corrects) / total - -def calculate_weighted_ap_by_true_list(corrects, weights, total): - precision = np.cumsum(corrects * weights) / (np.cumsum(weights) + 0.0001) - if total == 0: - return 0 - return np.mean(precision) * np.sum(corrects) / total - -def calculate_ap_by_true_list_100(corrects, confs, total): - precision = (1. * np.cumsum(corrects)) / map(lambda x: 100. * (1 - x) + 1, confs) - return np.sum(precision * corrects) / total - -def calculate_image_ap_weighted(predicts, gts, weights): - corrects, _ = match_prediction_to_gt(predicts, gts) - return calculate_weighted_ap_by_true_list(corrects, weights, len(gts)) - -def match_prediction_to_gt(predicts, gts, iou_th=0.5): - matched = [False] * len(gts) - corrects = np.zeros(len(predicts)) - match_idx = [-1] * len(predicts) - for j, p in enumerate(predicts): - idx_gts = [(i, g) for i, g in enumerate(gts) if not matched[i]] - if len(idx_gts) == 0: - # max does not support empty input - continue - idx_gt_ious = [(i, g, calculate_iou(p, g)) for i, g in idx_gts] - max_idx, _, max_iou = max(idx_gt_ious, key=lambda x: x[-1]) - if max_iou > iou_th: - matched[max_idx] = True - corrects[j] = 1 - match_idx[j] = max_idx - return corrects, match_idx - -def calculate_image_ap(predicts, gts, count_num=False): - ''' - a list of rects, use 2 to return more info - ''' - corrects, _ = match_prediction_to_gt(predicts, gts) - if not count_num: - return calculate_ap_by_true_list(corrects, len(gts)) - else: - return calculate_ap_by_true_list_count_num(corrects, len(gts)) - - -def calculate_image_ap2(predicts, gts): - ''' - a list of rects - ''' - corrects, match_idx = match_prediction_to_gt(predicts, gts) - return calculate_ap_by_true_list(corrects, len(gts)), match_idx - -def get_parameters_by_full_expid(full_expid): - yaml_file = op.join('output', full_expid, 'parameters.yaml') - if not op.isfile(yaml_file): - return None - param = load_from_yaml_file(yaml_file) - if 'data' not in param: - param['data'], param['net'] = parse_data_net(full_expid, - param['expid']) - return param - -def get_all_model_expid(): - names = os.listdir('./output') - return names - -def get_target_images(predicts, gts, cat, threshold): - image_aps = [] - for key in predicts: - rects = predicts[key] - curr_gt = [g for g in gts[key] if cat == 'any' or g['class'] == cat] - curr_pred = [p for p in predicts[key] if cat == 'any' or (p['class'] == cat and - p['conf'] > threshold)] - if len(curr_gt) == 0 and len(curr_pred) == 0: - continue - curr_pred = sorted(curr_pred, key=lambda x: -x['conf']) - ap = calculate_image_ap([p['rect'] for p in curr_pred], - [g['rect'] for g in curr_gt]) - image_aps.append((key, ap)) - image_aps = sorted(image_aps, key=lambda x: x[1]) - #image_aps = sorted(image_aps, key=lambda x: -x[1]) - target_images = [key for key, ap in 
image_aps] - return target_images, image_aps - -def readable_confusion_entry(entry): - ''' - entry: dictionary, key: label, value: count - ''' - label_count = [(label, entry[label]) for label in entry] - label_count.sort(key=lambda x: -x[1]) - total = sum([count for label, count in label_count]) - percent = [1. * count / total for label, count in label_count] - cum_percent = np.cumsum(percent) - items = [] - for i, (label, count) in enumerate(label_count): - if i >= 5: - continue - items.append((label, '{}'.format(count), '{:.1f}'.format(100. * - percent[i]), - '{:.1f}'.format(100. * cum_percent[i]))) - return items - -def get_all_tree_data(): - names = sorted(os.listdir('./data')) - return [name for name in names - if op.isfile(op.join('data', name, 'root_enriched.yaml'))] - -def parse_test_data_with_version_with_more_param(predict_file): - pattern = \ - 'model(?:_iter)?_-?[0-9]*[e]?\.(?:caffemodel|pth\.tar|pth|pt)\.(.*)\.(trainval|train|test|train_[0-9]*_[0-9]*)\..*?(\.v[0-9])?\.(?:predict|report)' - match_result = re.match(pattern, predict_file) - if match_result: - assert match_result - result = match_result.groups() - if result[2] is None: - v = 0 - else: - v = int(result[2][2]) - return result[0], result[1], v - -def parse_test_data_with_version(predict_file): - # with version - result = parse_test_data_with_version_with_more_param(predict_file) - if result is not None: - return result - #model_iter_0040760.TaxCaptionBot.trainval.predict.report - all_pattern = [ - 'model(?:_iter)?_-?[0-9]*[e]?\.(?:caffemodel|pth\.tar|pth|pt)\.(.*)\.(trainval|train|test|val)\.(\.v[0-9])?(?:predict|report)', - 'model_iter_[0-9]*\.(.*)\.(trainval|train|test|val)\..*predict\.(?:ir_acc|caption|vqa_acc)\.report', - 'model(?:_iter)?_-?[0-9]*[e]?\.(.*)\.(trainval|train|test|val)\.(\.v[0-9])?.*(?:predict|report|tsv)', - 'model_iter_[0-9]*\.(.*)\.([a-zA-Z0-9]+)\..*predict\.(?:ir_acc|caption|vqa_acc)\.report', - ] - for p in all_pattern: - match_result = re.match(p, predict_file) - if match_result is not None: - result = match_result.groups() - if len(result) == 2 or result[2] is None: - v = 0 - else: - v = int(result[2][2:]) - return result[0], result[1], v - - pattern = \ - 'model(?:_iter)?_-?[0-9]*[e]?\.(?:caffemodel|pth\.tar|pth|pt)\.([^\.]*).*?(\.v[0-9])?\.(?:predict|report)' - match_result = re.match(pattern, predict_file) - assert match_result - result = match_result.groups() - if result[1] is None: - v = 0 - else: - v = int(result[1][2:]) - return result[0], 'test', v - -def parse_test_data(predict_file): - return parse_test_data_with_version(predict_file)[:2] - -def parse_data(full_expid): - all_data = os.listdir('data/') - candidates = [data for data in all_data if full_expid.startswith(data)] - max_length = max([len(c) for c in candidates]) - return [c for c in candidates if len(c) == max_length][0] - -def parse_iteration(file_name): - patterns = [ - '.*model(?:_iter)?_([0-9]*)\..*', - '.*model(?:_iter)?_([0-9]*)e\..*', - '.*model(?:_iter)?_([0-9]*)$', - ] - for p in patterns: - r = re.match(p, file_name) - if r is not None: - return int(float(r.groups()[0])) - logging.info('unable to parse the iterations for {}'.format(file_name)) - return -2 - -def parse_snapshot_rank(predict_file): - ''' - it could be iteration, or epoch - ''' - pattern = 'model_iter_([0-9]*)e*\.|model_([0-9]*)e*\.pth' - match_result = re.match(pattern, predict_file) - if match_result is None: - return -1 - else: - matched_iters = [r for r in match_result.groups() if r is not None] - assert len(matched_iters) == 1 - return 
int(matched_iters[0]) - -def get_all_predict_files(full_expid): - model_folder = op.join('output', full_expid, 'snapshot') - - predict_files = [] - - found = glob.glob(op.join(model_folder, '*.predict')) - predict_files.extend([op.basename(f) for f in found]) - - found = glob.glob(op.join(model_folder, '*.predict.tsv')) - predict_files.extend([op.basename(f) for f in found]) - - iterations = [(parse_snapshot_rank(p), p) for p in predict_files] - iterations.sort(key=lambda x: -x[0]) - return [p for i, p in iterations] - -def dict_to_list(d, idx): - result = [] - for k in d: - vs = d[k] - for v in vs: - try: - r = [] - # if v is a list or tuple - r.extend(v[:idx]) - r.append(k) - r.extend(v[idx: ]) - except TypeError: - r = [] - if idx == 0: - r.append(k) - r.append(v) - else: - assert idx == 1 - r.append(v) - r.append(k) - result.append(r) - return result - -def list_to_dict_unique(l, idx): - result = list_to_dict(l, idx) - for key in result: - result[key] = list(set(result[key])) - return result - -def list_to_dict(l, idx, keep_one=False): - result = OrderedDict() - for x in l: - if x[idx] not in result: - result[x[idx]] = [] - y = x[:idx] + x[idx + 1:] - if not keep_one and len(y) == 1: - y = y[0] - result[x[idx]].append(y) - return result - -def generate_lineidx(filein, idxout): - idxout_tmp = idxout + '.tmp' - with open(filein, 'r') as tsvin, open(idxout_tmp,'w') as tsvout: - fsize = os.fstat(tsvin.fileno()).st_size - fpos = 0 - fbar_last_pos = 0 - fbar = qd_tqdm(total=fsize, unit_scale=True) - while fpos!=fsize: - tsvout.write(str(fpos)+"\n"); - tsvin.readline() - fpos = tsvin.tell(); - fbar.update(fpos - fbar_last_pos) - fbar_last_pos = fpos - os.rename(idxout_tmp, idxout) - -def drop_second_batch_in_bn(net): - assert net.layer[0].type == 'TsvBoxData' - assert net.layer[1].type == 'TsvBoxData' - slice_batch_layers = [l for l in net.layer if l.name == 'slice_batch'] - assert len(slice_batch_layers) == 1 - slice_batch_layer = slice_batch_layers[0] - slice_point = slice_batch_layer.slice_param.slice_point[0] - - for i, l in enumerate(net.layer): - if l.type == 'BatchNorm': - top_name = l.top[0] - top_name2 = top_name + '_n' - l.top[0] = top_name2 - for m in net.layer[i + 1:]: - for j, b in enumerate(m.bottom): - if b == top_name: - m.bottom[j] = top_name2 - for j, t in enumerate(m.top): - if t == top_name: - m.top[j] = top_name2 - - all_origin_layer = [] - for l in net.layer: - all_origin_layer.append(l) - all_layer = [] - for l in all_origin_layer: - if l.type != 'BatchNorm': - all_layer.append(l) - continue - bn_input = l.bottom[0] - bn_output = l.top[0] - - slice_layer = net.layer.add() - slice_layer.name = l.name + '/slice' - slice_layer.type = 'Slice' - slice_layer.bottom.append(bn_input) - slice_layer.top.append(l.name + '/slice0') - slice_layer.top.append(l.name + '/slice1') - slice_layer.slice_param.axis = 0 - slice_layer.slice_param.slice_point.append(slice_point) - all_layer.append(slice_layer) - - l.bottom.remove(l.bottom[0]) - l.bottom.append(l.name + '/slice0') - l.top.remove(l.top[0]) - l.top.append(l.name + '/slice0') - all_layer.append(l) - - fix_bn_layer = net.layer.add() - fix_bn_layer.name = l.name + '/bn1' - fix_bn_layer.bottom.append(l.name + '/slice1') - fix_bn_layer.top.append(l.name + '/slice1') - fix_bn_layer.type = 'BatchNorm' - for _ in range(3): - p = fix_bn_layer.param.add() - p.lr_mult = 0 - p.decay_mult = 0 - fix_bn_layer.batch_norm_param.use_global_stats = True - all_layer.append(fix_bn_layer) - - cat_layer = net.layer.add() - cat_layer.name = l.name + 
'/concat'
-        cat_layer.type = 'Concat'
-        cat_layer.bottom.append(l.name + '/slice0')
-        cat_layer.bottom.append(l.name + '/slice1')
-        cat_layer.top.append(bn_output)
-        cat_layer.concat_param.axis = 0
-        all_layer.append(cat_layer)
-
-    while len(net.layer) > 0:
-        net.layer.remove(net.layer[0])
-    net.layer.extend(all_layer)
-
-def fix_net_bn_layers(net, num_bn_fix):
-    for l in net.layer:
-        if l.type == 'BatchNorm':
-            if num_bn_fix > 0:
-                l.batch_norm_param.use_global_stats = True
-                num_bn_fix = num_bn_fix - 1
-            else:
-                break
-
-def is_cluster(ssh_info):
-    return '-p' in ssh_info and '-i' not in ssh_info
-
-def visualize_net(net):
-    delta = 0.000001
-    data_values = []
-    for key in net.blobs:
-        data_value = np.mean(np.abs(net.blobs[key].data))
-        data_values.append(data_value + delta)
-    diff_values = []
-    for key in net.blobs:
-        diff_values.append(np.mean(np.abs(net.blobs[key].diff)) + delta)
-    param_keys = []
-    param_data = []
-    for key in net.params:
-        for i, b in enumerate(net.params[key]):
-            param_keys.append('{}_{}'.format(key, i))
-            param_data.append(np.mean(np.abs(b.data)) + delta)
-    param_diff = []
-    for key in net.params:
-        for i, b in enumerate(net.params[key]):
-            param_diff.append(np.mean(np.abs(b.diff)) + delta)
-
-    xs = range(len(net.blobs))
-    plt.gcf().clear()
-    plt.subplot(2, 1, 1)
-
-    plt.semilogy(xs, data_values, 'r-o')
-    plt.semilogy(xs, diff_values, 'b-*')
-    plt.xticks(xs, net.blobs.keys(), rotation='vertical')
-    plt.grid()
-
-    plt.subplot(2, 1, 2)
-    xs = range(len(param_keys))
-    plt.semilogy(xs, param_data, 'r-o')
-    plt.semilogy(xs, param_diff, 'b-*')
-    plt.xticks(xs, param_keys, rotation='vertical')
-    plt.grid()
-    plt.draw()
-    plt.pause(0.001)
-
-def visualize_train(solver):
-    plt.figure()
-    features = []
-    for i in range(100):
-        visualize_net(solver.net)
-        solver.step(10)
-
-def network_input_to_image(data, mean_value, std_value=[1.0, 1.0, 1.0]):
-    all_im = []
-    for d in data:
-        im = (d.transpose((1, 2, 0))
-              * np.asarray(std_value).reshape(1, 1, 3)
-              + np.asarray(mean_value).reshape(1, 1, 3)
-              ).astype(np.uint8).copy()
-        all_im.append(im)
-    return all_im
-
-def remove_data_augmentation(data_layer):
-    assert data_layer.type == 'TsvBoxData'
-    data_layer.box_data_param.jitter = 0
-    data_layer.box_data_param.hue = 0
-    data_layer.box_data_param.exposure = 1
-    data_layer.box_data_param.random_scale_min = 1
-    data_layer.box_data_param.random_scale_max = 1
-    data_layer.box_data_param.fix_offset = True
-    data_layer.box_data_param.saturation = True
-
-def check_best_iou(biases, gt_w, gt_h, n):
-    def iou(gt_w, gt_h, w, h):
-        inter = min(gt_w, w) * min(gt_h, h)
-        return inter / (gt_w * gt_h + w * h - inter)
-
-    best_iou = -1
-    best_n = -1
-    # biases holds len(biases) // 2 anchor (w, h) pairs
-    for i in range(len(biases) // 2):
-        u = iou(gt_w, gt_h, biases[2 * i], biases[2 * i + 1])
-        if u > best_iou:
-            best_iou = u
-            best_n = i
-    assert best_n == n
-
-def calculate_iou1(rect0, rect1):
-    '''
-    rects are x0, y0, x1, y1 with inclusive integer coordinates
-    '''
-    w = min(rect0[2], rect1[2]) - max(rect0[0], rect1[0]) + 1
-    if w < 0:
-        return 0
-    h = min(rect0[3], rect1[3]) - max(rect0[1], rect1[1]) + 1
-    if h < 0:
-        return 0
-    i = w * h
-    a1 = (rect1[2] - rect1[0] + 1) * (rect1[3] - rect1[1] + 1)
-    a0 = (rect0[2] - rect0[0] + 1) * (rect0[3] - rect0[1] + 1)
-    if a0 == 0 and a1 == 0 and i == 0:
-        return 1.
-    return 1. * i / (a0 + a1 - i)
-
-def calculate_iou_xywh(r0, r1):
-    r0 = [r0[0] - r0[2] / 2.,
-          r0[1] - r0[3] / 2.,
-          r0[0] + r0[2] / 2.,
-          r0[1] + r0[3] / 2.]
-    r1 = [r1[0] - r1[2] / 2.,
-          r1[1] - r1[3] / 2.,
-          r1[0] + r1[2] / 2.,
-          r1[1] + r1[3] / 2.]
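-    # both boxes are now in corner form (x0, y0, x1, y1); e.g. a centered
-    # unit square (cx, cy, w, h) = (0.5, 0.5, 1, 1) maps to [0., 0., 1., 1.],
-    # so the generic corner-based IoU below applies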
- - return calculate_iou(r0, r1) - -def calculate_iou(rect0, rect1): - ''' - x0, y1, x2, y3 - ''' - w = min(rect0[2], rect1[2]) - max(rect0[0], rect1[0]) - if w < 0: - return 0 - h = min(rect0[3], rect1[3]) - max(rect0[1], rect1[1]) - if h < 0: - return 0 - i = w * h - a1 = (rect1[2] - rect1[0]) * (rect1[3] - rect1[1]) - a0 = (rect0[2] - rect0[0]) * (rect0[3] - rect0[1]) - if a0 == 0 and a1 == 0 and i == 0: - return 1. - return 1. * i / (a0 + a1 - i) - -#def process_run(func, *args, **kwargs): - #def internal_func(queue): - #result = func(*args, **kwargs) - #queue.put(result) - #queue = mp.Queue() - #p = Process(target=internal_func, args=(queue,)) - #p.start() - #p.join() - #assert p.exitcode == 0 - #return queue.get() - -class ExceptionWrapper(object): - def __init__(self, m): - self.message = m - -def process_run(func, *args, **kwargs): - def internal_func(queue): - try: - result = func(*args, **kwargs) - queue.put(result) - except Exception: - queue.put(ExceptionWrapper(traceback.format_exc())) - queue = mp.Queue() - p = Process(target=internal_func, args=(queue,)) - p.start() - result = queue.get() - p.join() - if isinstance(result, ExceptionWrapper): - raise Exception(result.message) - return result - -def setup_yaml(): - """ https://stackoverflow.com/a/8661021 """ - represent_dict_order = lambda self, data: self.represent_mapping('tag:yaml.org,2002:map', data.items()) - yaml.add_representer(OrderedDict, represent_dict_order) - try: - yaml.add_representer(unicode, unicode_representer) - except NameError: - logging.info('python 3 env') - -def init_logging(): - np.seterr(divide = "raise", over="warn", under="warn", invalid="raise") - - ch = logging.StreamHandler(stream=sys.stdout) - ch.setLevel(logging.INFO) - logger_fmt = logging.Formatter('%(asctime)s.%(msecs)03d %(process)d:%(filename)s:%(lineno)s %(funcName)10s(): %(message)s') - ch.setFormatter(logger_fmt) - - root = logging.getLogger() - root.handlers = [] - root.addHandler(ch) - root.setLevel(logging.INFO) - - setup_yaml() - -def ensure_directory(path): - if path == '' or path == '.': - return - if path != None and len(path) > 0: - assert not op.isfile(path), '{} is a file'.format(path) - if not os.path.exists(path) and not op.islink(path): - try: - os.makedirs(path) - except: - if os.path.isdir(path): - # another process has done makedir - pass - else: - raise - -def parse_pattern(pattern, s): - result = parse_pattern_as_is(pattern, s) - if result is None: - return - return [float(g) for g in result] - -def parse_pattern_as_is(pattern, s): - result = re.search(pattern, s) - if result is None: - return result - return [g for g in result.groups()] - -def iter_match_document(pattern, fname): - for line in read_lines(fname): - result = parse_pattern_as_is(pattern, line) - if result is None: - continue - yield result - -def parse_yolo_log(log_file): - pattern = 'loss_xy: ([0-9, .]*); loss_wh: ([0-9, .]*); ' - pattern = pattern + 'loss_objness: ([0-9, .]*); loss_class: ([0-9, .]*)' - - base_log_lines = read_lines(log_file) - xys = [] - whs = [] - loss_objnesses = [] - loss_classes = [] - for line in base_log_lines: - gs = parse_pattern(pattern, line) - if gs is None: - continue - idx = 0 - xys.append(float(gs[idx])) - idx = idx + 1 - whs.append(float(gs[idx])) - idx = idx + 1 - loss_objnesses.append(float(gs[idx])) - idx = idx + 1 - loss_classes.append(float(gs[idx])) - - return xys, whs, loss_objnesses, loss_classes - -def parse_nums(p, log_file): - result = [] - for line in read_lines(log_file): - gs = parse_pattern(p, line) - if 
gs is None: - continue - result.append(gs) - return result - -def parse_yolo_log_st(log_file): - p = 'region_loss_layer\.cpp:1138] ([0-9]*), ([0-9]*), ([0-9]*), ([0-9]*), ([0-9]*), ([0-9]*), ([0-9]*), ([0-9]*)' - ss = parse_nums(p, log_file) - p = 'region_loss_layer\.cpp:1140] ([0-9]*), ([0-9]*), ([0-9]*), ([0-9]*), ([0-9]*), ([0-9]*), ([0-9]*), ([0-9]*)' - tt = parse_nums(p, log_file) - return ss, tt - -def parse_yolo_log_acc(log_file): - p = 'Region Avg IOU: ([0-9, .]*), Class: ([0-9, .]*), ' - p = p + 'Obj: ([0-9, .]*), No Obj: ([0-9, .]*), Avg Recall: ([0-9, .]*), count: ([0-9]*)' - all_ious = [] - all_probs = [] - all_obj = [] - all_noobj = [] - all_recall = [] - all_count = [] - for line in read_lines(log_file): - gs = parse_pattern(p, line) - if gs is None: - continue - all_ious.append(gs[0]) - all_probs.append(gs[1]) - all_obj.append(gs[2]) - all_noobj.append(gs[3]) - all_recall.append(gs[4]) - all_count.append(gs[5]) - return all_ious, all_probs, all_obj, all_noobj, all_recall, all_count - - -def read_lines(file_name, **kwargs): - with open(file_name, 'r', **kwargs) as fp: - for line in fp: - yield line - -def read_to_buffer(file_name): - with open(file_name, 'rb') as fp: - all_line = fp.read() - return all_line - -class Model(object): - def __init__(self, test_proto_file, train_proto_file, model_param, mean_value, scale, model_iter): - self.test_proto_file = test_proto_file - self.model_param = model_param - self.mean_value = mean_value - self.model_iter = model_iter - self.scale = scale - self.train_proto_file = train_proto_file - -def adjust_tree_prediction_threshold(n, tree_th): - found = False - for l in n.layer: - if l.type == 'SoftmaxTreePrediction': - found = True - l.softmaxtreeprediction_param.threshold = tree_th - assert found - -def remove_nms(n): - for l in n.layer: - if l.type == 'RegionOutput': - l.region_output_param.nms = -1 - if l.type == 'RegionPrediction': - l.region_prediction_param.nms = -1 - -def update_conv_channels(net, factor, skip): - c = 0 - s = 0 - for l in net.layer: - if l.type == 'Convolution': - if s < skip: - s = s + 1 - continue - o = l.convolution_param.num_output - l.convolution_param.num_output = int(o * factor) - c = c + 1 - logging.info('updated {} layers for channel factor'.format(c)) - -def get_channel(net, blob_name): - for l in net.layer: - if l.type == 'Convolution': - assert len(l.top) == 1 - if l.top[0] == blob_name: - return l.convolution_param.num_output - assert False, 'not found' - -def fix_net_parameters(net, last_fixed_param): - found = False - no_param_layers = set(['TsvBoxData', 'ReLU', 'Pooling', 'Reshape', - 'EuclideanLoss', 'Sigmoid']) - unknown_layers = [] - for l in net.layer: - if l.type == 'Convolution' or l.type == 'Scale': - if l.type == 'Convolution': - assert len(l.param) >= 1 - else: - if len(l.param) == 0: - p = l.param.add() - p.lr_mult = 0 - p.decay_mult = 0 - if l.scale_param.bias_term: - p = l.param.add() - p.lr_mult = 0 - p.decay_mult = 0 - for p in l.param: - p.lr_mult = 0 - p.decay_mult = 0 - elif l.type == 'BatchNorm': - l.batch_norm_param.use_global_stats = True - else: - if l.type not in no_param_layers: - unknown_layers.append(l.type) - if l.name == last_fixed_param: - for b in l.bottom: - l.propagate_down.append(False) - found = True - break - assert len(unknown_layers) == 0, ', '.join(unknown_layers) - assert found - -def set_no_bias(net, layer_name): - for l in net.layer: - if l.name == layer_name: - assert l.type == 'Convolution' - l.convolution_param.bias_term = False - if len(l.param) == 2: - del 
l.param[1]
-            else:
-                assert len(l.param) == 0
-            return
-    assert False
-
-def add_yolo_angular_loss_regularizer(net, **kwargs):
-    for l in net.layer:
-        if l.name == 'angular_loss':
-            logging.info('angular loss exists')
-            return
-    conf_layer = None
-    for l in net.layer:
-        if l.name == 'conf':
-            conf_layer = l
-            assert 'conf' in l.top
-    found_t_label = False
-    for l in net.layer:
-        if 't_label' in l.top:
-            found_t_label = True
-            break
-    assert conf_layer and found_t_label
-
-    conf_layer.param[0].name = 'conf_w'
-    CA = conf_layer.convolution_param.num_output
-    assert len(conf_layer.bottom) == 1
-    num_feature = get_channel(net, conf_layer.bottom[0])
-
-    param_layer = net.layer.add()
-    param_layer.name = 'param_conf_w'
-    param_layer.type = 'Parameter'
-    param_layer.parameter_param.shape.dim.append(CA)
-    param_layer.parameter_param.shape.dim.append(num_feature)
-    param_layer.parameter_param.shape.dim.append(1)
-    param_layer.parameter_param.shape.dim.append(1)
-    param_layer.top.append('conf_w')
-    p = param_layer.param.add()
-    p.name = 'conf_w'
-
-    layer = net.layer.add()
-    layer.name = 'angular_loss'
-    layer.type = 'Python'
-    layer.bottom.append(conf_layer.bottom[0])
-    layer.bottom.append('t_label')
-    layer.bottom.append('conf_w')
-    layer.python_param.module = 'kcenter_exp'
-    layer.python_param.layer = 'YoloAngularLossLayer'
-    layer.propagate_down.append(True)
-    layer.propagate_down.append(False)
-    layer.propagate_down.append(False)
-    layer.top.append('angular_loss')
-    weight = kwargs.get('yolo_angular_loss_weight', 1)
-    layer.loss_weight.append(weight)
-
-def add_yolo_low_shot_regularizer(net, low_shot_label_idx):
-    assert net.layer[-1].type == 'RegionLoss'
-    assert net.layer[-2].type == 'Convolution'
-    assert net.layer[-1].bottom[0] == net.layer[-2].top[0]
-    assert net.layer[-2].convolution_param.kernel_size[0] == 1
-    assert net.layer[-2].convolution_param.kernel_h == 0
-    assert net.layer[-2].convolution_param.kernel_w == 0
-
-    num_classes = net.layer[-1].region_loss_param.classes
-    # the biases list stores one (w, h) pair per anchor
-    num_anchor = len(net.layer[-1].region_loss_param.biases) // 2
-
-    param_dim1 = net.layer[-2].convolution_param.num_output
-    param_dim2 = get_channel(net, net.layer[-2].bottom[0])
-
-    # add the parameter name into the convolutional layer
-    last_conv_param_name = 'last_conv_param_low_shot'
-    net.layer[-2].param[0].name = last_conv_param_name
-
-    # add the parameter layer to expose the parameter
-    param_layer = net.layer.add()
-    param_layer.type = 'Parameter'
-    param_layer.name = 'param_last_conv'
-    param_layer.top.append(last_conv_param_name)
-    p = param_layer.param.add()
-    p.name = last_conv_param_name
-    p.lr_mult = 1
-    p.decay_mult = 1
-    param_layer.parameter_param.shape.dim.append(param_dim1)
-    param_layer.parameter_param.shape.dim.append(param_dim2)
-
-    # add the regularizer layer
-    reg_layer = net.layer.add()
-    reg_layer.type = 'Python'
-    reg_layer.name = 'equal_norm'
-    reg_layer.bottom.append(last_conv_param_name)
-    reg_layer.top.append('equal_norm')
-    reg_layer.loss_weight.append(1)
-    reg_layer.python_param.module = 'equal_norm_loss'
-    reg_layer.python_param.layer = 'YoloAlignNormToBaseLossLayer'
-    reg_param = {'num_classes': num_classes,
-                 'low_shot_label_idx': low_shot_label_idx,
-                 'num_anchor': num_anchor}
-    reg_layer.python_param.param_str = json.dumps(reg_param)
-
-def update_kernel_active(net, kernel_active, kernel_active_skip):
-    assert False, 'use update_kernel_active2'
-    c = 0
-    skipped = 0
-    logging.info('{}-{}'.format(kernel_active, kernel_active_skip))
-    for l in net.layer:
-        if l.type ==
'Convolution': - if skipped < kernel_active_skip: - skipped = skipped + 1 - logging.info('skiping to update active kernel') - continue - l.convolution_param.kernel_active = kernel_active - c = c + 1 - - logging.info('update {} layers'.format(c)) - - -def plot_to_file(xs, ys, file_name, **kwargs): - fig = plt.figure() - semilogy = kwargs.get('semilogy') - if all(isinstance(x, str) for x in xs): - xs2 = range(len(xs)) - #plt.xticks(xs2, xs, rotation=15, ha='right') - plt.xticks(xs2, xs, rotation='vertical') - xs = xs2 - if type(ys) is dict: - for key in ys: - if semilogy: - plt.semilogy(xs, ys[key], '-o') - else: - plt.plot(xs, ys[key], '-o') - else: - if semilogy: - plt.semilogy(xs, ys, '-o') - else: - plt.plot(xs, ys, '-o') - plt.grid() - if 'ylim' in kwargs: - plt.ylim(kwargs['ylim']) - ensure_directory(op.dirname(file_name)) - plt.tight_layout() - # explicitly remove the file because philly does not support overwrite - if op.isfile(file_name): - try: - os.remove(file_name) - except: - logging.info('{} exists but could not be deleted'.format( - file_name)) - fig.savefig(file_name) - plt.close(fig) - -def parse_training_time(log_file): - log = read_to_buffer(log_file) - all_time_cost = [] - all_iters = [] - for line in log.split('\n'): - m = re.match('.*Iteration.*iter\/s, ([0-9\.]*)s\/([0-9]*) iters.*', line) - if m: - r = m.groups() - time_cost = float(r[0]) - iters = float(r[1]) - all_iters.append(iters) - all_time_cost.append(time_cost) - return all_iters, all_time_cost - -def encode_expid(prefix, *args): - parts = [prefix] - for (t, a) in args: - p = '' - if a != None: - if type(a) == str: - a = a.replace(':', '_') - if t != None and len(t) > 0: - p = p + '_{}'.format(t) - p = p + '_{}'.format(a) - parts.append(p) - return ''.join(parts) - -def unicode_representer(dumper, uni): - node = yaml.ScalarNode(tag=u'tag:yaml.org,2002:str', value=uni) - return node - -def dump_to_yaml_bytes(context): - return yaml.dump(context, default_flow_style=False, - encoding='utf-8', allow_unicode=True) - -def dump_to_yaml_str(context): - return dump_to_yaml_bytes(context).decode() - -def write_to_yaml_file(context, file_name): - ensure_directory(op.dirname(file_name)) - with open(file_name, 'w') as fp: - yaml.dump(context, fp, default_flow_style=False, - encoding='utf-8', allow_unicode=True) - -def load_from_yaml_str(s): - return yaml.load(s, Loader=yaml.UnsafeLoader) - -def load_from_yaml_file(file_name): - with open(file_name, 'r') as fp: - data = load_from_yaml_str(fp) - while isinstance(data, dict) and '_base_' in data: - b = op.join(op.dirname(file_name), data['_base_']) - result = load_from_yaml_file(b) - assert isinstance(result, dict) - del data['_base_'] - all_key = get_all_path(data, with_list=False) - for k in all_key: - v = dict_get_path_value(data, k) - dict_update_path_value(result, k, v) - data = result - return data - -def write_to_file(contxt, file_name, append=False): - p = os.path.dirname(file_name) - ensure_directory(p) - if type(contxt) is str: - contxt = contxt.encode() - flag = 'wb' - if append: - flag = 'ab' - with open(file_name, flag) as fp: - fp.write(contxt) - -def load_list_file(fname): - with open(fname, 'r') as fp: - lines = fp.readlines() - result = [line.strip() for line in lines] - if len(result) > 0 and result[-1] == '': - result = result[:-1] - return result - -class LoopProcess(Process): - def __init__(self, group=None, target=None, name=None, args=(), kwargs={}): - ''' - same signiture with Process.__init__ - The process will keep running the function of target and 
will wait for - several seconds in between. This is useful to run some monitoring job - or regular job - ''' - super(LoopProcess, self).__init__(group, target, name, args, kwargs) - self._exit = Event() - - def run(self): - sleep_time = 5 - while not self._exit.is_set(): - if self._target: - self._target(*self._args, **self._kwargs) - time.sleep(sleep_time) - - def init_shutdown(self): - self._exit.set() - -class PyTee(object): - def __init__(self, logstream, stream_name): - valid_streams = ['stderr','stdout']; - if stream_name not in valid_streams: - raise IOError("valid stream names are %s" % ', '.join(valid_streams)) - self.logstream = logstream - self.stream_name = stream_name; - def __del__(self): - pass; - def write(self, data): #tee stdout - self.logstream.write(data); - self.fstream.write(data); - self.logstream.flush(); - self.fstream.flush(); - - def flush(self): - self.logstream.flush(); - self.fstream.flush(); - - def __enter__(self): - if self.stream_name=='stdout' : - self.fstream = sys.stdout - sys.stdout = self; - else: - self.fstream = sys.stderr - sys.stderr = self; - self.fstream.flush(); - def __exit__(self, _type, _value, _traceback): - if self.stream_name=='stdout' : - sys.stdout = self.fstream; - else: - sys.stderr = self.fstream; - -def parse_basemodel_with_depth(net): - ''' - darknet19->darknet19 - darknet19_abc->darknet19 - ''' - if '_' not in net: - return net - else: - i = net.index('_') - return net[: i] - -def worth_create(base, derived, buf_second=0): - if not op.isfile(base) and \ - not op.islink(base) and \ - not op.isdir(base): - return False - if os.path.isfile(derived) and \ - os.path.getmtime(derived) > os.path.getmtime(base) - buf_second: - return False - else: - return True - -def basename_no_ext(file_name): - return op.splitext(op.basename(file_name))[0] - -def default_data_path(dataset): - ''' - use TSVDataset instead - ''' - proj_root = os.path.dirname(os.path.dirname(os.path.realpath(__file__))); - result = {} - data_root = os.path.join(proj_root, 'data', dataset) - result['data_root'] = data_root - result['source'] = os.path.join(data_root, 'train.tsv') - result['trainval'] = op.join(data_root, 'trainval.tsv') - result['test_source'] = os.path.join(data_root, 'test.tsv') - result['labelmap'] = os.path.join(data_root, 'labelmap.txt') - result['source_idx'] = os.path.join(data_root, 'train.lineidx') - result['test_source_idx'] = os.path.join(data_root, 'test.lineidx') - return result - -class FileProgressingbar: - fileobj = None - pbar = None - def __init__(self, fileobj, keyword='Test'): - fileobj.seek(0,os.SEEK_END) - flen = fileobj.tell() - fileobj.seek(0,os.SEEK_SET) - self.fileobj = fileobj - widgets = ['{}: '.format(keyword), progressbar.AnimatedMarker(),' ', progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA()] - self.pbar = progressbar.ProgressBar(widgets=widgets, maxval=flen).start() - def update(self): - self.pbar.update(self.fileobj.tell()) - -def is_pil_image(image): - from PIL import Image - return isinstance(image, Image.Image) - -def encoded_from_img(im, quality=None, save_type=None): - if save_type is None: - save_type = 'jpg' - assert save_type in ['jpg', 'png'] - if not is_pil_image(im): - if quality: - x = cv2.imencode('.{}'.format(save_type), im, - (cv2.IMWRITE_JPEG_QUALITY, quality))[1] - else: - x = cv2.imencode('.{}'.format(save_type), im)[1] - else: - if save_type in ['jpg', None]: - save_type = 'JPEG' - import io - x = io.BytesIO() - im.save(x, format=save_type) - x = x.getvalue() - return 
base64.b64encode(x) - -def encode_image(im, quality=None): - if quality: - x = cv2.imencode('.jpg', im, (cv2.IMWRITE_JPEG_QUALITY, quality))[1] - else: - x = cv2.imencode('.jpg', im)[1] - return x.tobytes() - -def is_valid_image(im): - return im is not None and all(x != 0 for x in im.shape) - -def pilimg_from_base64(imagestring): - try: - import io - jpgbytestring = base64.b64decode(imagestring) - image = Image.open(io.BytesIO(jpgbytestring)) - image = image.convert('RGB') - return image - except: - return None; - -def img_from_base64(imagestring): - try: - jpgbytestring = base64.b64decode(imagestring) - nparr = np.frombuffer(jpgbytestring, np.uint8) - r = cv2.imdecode(nparr, cv2.IMREAD_COLOR); - return r - except: - return None; - -def img_from_bytes(jpgbytestring): - try: - nparr = np.frombuffer(jpgbytestring, np.uint8) - r = cv2.imdecode(nparr, cv2.IMREAD_COLOR); - return r - except: - return None - -def img_from_base64_ignore_rotation(str_im): - jpgbytestring = base64.b64decode(str_im) - nparr = np.frombuffer(jpgbytestring, np.uint8) - im = cv2.imdecode(nparr, cv2.IMREAD_IGNORE_ORIENTATION); - return im - -def encode_decode_im(im, quality): - with BytesIO() as output: - im.save(output, 'JPEG', quality=quality) - im = Image.open(output).convert('RGB') - return im - -def int_rect(rect, enlarge_factor=1.0, im_h=None, im_w=None): - assert(len(rect) == 4) - left, top, right, bot = rect - rw = right - left - rh = bot - top - - new_x = int(left + (1.0 - enlarge_factor) * rw / 2.0) - new_y = int(top + (1.0 - enlarge_factor) * rh / 2.0) - new_w = int(math.ceil(enlarge_factor * rw)) - new_h = int(math.ceil(enlarge_factor * rh)) - if im_h and im_w: - new_x = np.clip(new_x, 0, im_w) - new_y = np.clip(new_y, 0, im_h) - new_w = np.clip(new_w, 0, im_w - new_x) - new_h = np.clip(new_h, 0, im_h - new_y) - - return list(map(int, [new_x, new_y, new_x + new_w, new_y + new_h])) - -def is_valid_rect(rect): - return len(rect) == 4 and rect[0] < rect[2] and rect[1] < rect[3] - -def pass_key_value_if_has(d_from, from_key, d_to, to_key): - if from_key in d_from: - d_to[to_key] = d_from[from_key] - -def dict_update_nested_dict(a, b, overwrite=True): - for k, v in viewitems(b): - if k not in a: - dict_update_path_value(a, k, v) - else: - if isinstance(dict_get_path_value(a, k), dict) and isinstance(v, dict): - dict_update_nested_dict(dict_get_path_value(a, k), v, overwrite) - else: - if overwrite: - dict_update_path_value(a, k, v) - -def dict_ensure_path_key_converted(a): - for k in list(a.keys()): - v = a[k] - if '$' in k: - parts = k.split('$') - x = {} - x_curr = x - for p in parts[:-1]: - x_curr[p] = {} - x_curr = x_curr[p] - if isinstance(v, dict): - dict_ensure_path_key_converted(v) - x_curr[parts[-1]] = v - dict_update_nested_dict(a, x) - del a[k] - else: - if isinstance(v, dict): - dict_ensure_path_key_converted(v) - -def query_values_by_path_suffix(job_in_scheduler, suffix, default=None): - found = [] - for p in get_all_path(job_in_scheduler, leaf_only=False): - if p.endswith(suffix): - v = dict_get_path_value(job_in_scheduler, p) - found.append(v) - return found - -def query_path_by_suffix(job_in_scheduler, suffix, default=None): - found = [] - for p in get_all_path(job_in_scheduler, leaf_only=False): - if p.endswith(suffix): - v = dict_get_path_value(job_in_scheduler, p) - found.append(v) - if len(found) == 1: - return found[0] - elif len(found) > 1: - if all(f == found[0] for f in found[1:]): - return found[0] - else: - raise ValueError - else: - return default - -def get_all_path(d, with_type=False, 
leaf_only=True, with_list=True): - assert not with_type, 'will not support' - all_path = [] - - if isinstance(d, dict): - for k, v in d.items(): - all_sub_path = get_all_path( - v, with_type, leaf_only=leaf_only, with_list=with_list) - all_path.extend([k + '$' + p for p in all_sub_path]) - if not leaf_only or len(all_sub_path) == 0: - all_path.append(k) - elif (isinstance(d, tuple) or isinstance(d, list)) and with_list: - for i, _v in enumerate(d): - all_sub_path = get_all_path( - _v, with_type, - leaf_only=leaf_only, - with_list=with_list, - ) - all_path.extend(['{}$'.format(i) + p for p in all_sub_path]) - if not leaf_only or len(all_sub_path) == 0: - all_path.append('{}'.format(i)) - return all_path - -def dict_get_all_path(d, with_type=False, leaf_only=True): - all_path = [] - for k, v in viewitems(d): - if with_type: - if type(k) is str: - k = 's' + k - elif type(k) is int: - k = 'i' + str(k) - else: - raise NotImplementedError - if isinstance(v, dict): - all_sub_path = dict_get_all_path( - v, with_type, leaf_only=leaf_only) - all_path.extend([k + '$' + p for p in all_sub_path]) - if not leaf_only: - all_path.append(k) - elif isinstance(v, tuple) or isinstance(v, list): - for i, _v in enumerate(v): - prefix = '' if not with_type else 'i' - if isinstance(_v, (dict, list)): - all_sub_path = dict_get_all_path( - _v, with_type, - leaf_only=leaf_only) - all_path.extend([k + '${}{}$'.format(prefix, i) + p for p in all_sub_path]) - else: - all_path.append(k + '${}{}'.format(prefix, i)) - if not leaf_only: - all_path.append(k) - else: - all_path.append(k) - return all_path - -def dict_parse_key(k, with_type): - if with_type: - if k[0] == 'i': - return int(k[1:]) - else: - return k[1:] - return k - -def dict_has_path(d, p, with_type=False): - ps = p.split('$') - cur_dict = d - while True: - if len(ps) > 0: - k = dict_parse_key(ps[0], with_type) - if isinstance(cur_dict, dict) and k in cur_dict: - cur_dict = cur_dict[k] - ps = ps[1:] - elif isinstance(cur_dict, list): - try: - k = int(k) - except: - return False - cur_dict = cur_dict[k] - ps = ps[1:] - else: - return False - else: - return True - - -def dict_set_path_if_not_exist(param, k, v): - if not dict_has_path(param, k): - dict_update_path_value(param, k, v) - return True - else: - return False - -def dict_update_path_value(d, p, v): - ps = p.split('$') - while True: - if len(ps) == 1: - d[ps[0]] = v - break - else: - if ps[0] not in d: - d[ps[0]] = {} - d = d[ps[0]] - ps = ps[1:] - -def dict_remove_path(d, p): - ps = p.split('$') - assert len(ps) > 0 - cur_dict = d - need_delete = () - while True: - if len(ps) == 1: - if len(need_delete) > 0 and len(cur_dict) == 1: - del need_delete[0][need_delete[1]] - else: - del cur_dict[ps[0]] - return - else: - if len(cur_dict) == 1: - if len(need_delete) == 0: - need_delete = (cur_dict, ps[0]) - else: - need_delete = (cur_dict, ps[0]) - cur_dict = cur_dict[ps[0]] - ps = ps[1:] - -def dict_get_path_value(d, p, with_type=False): - ps = p.split('$') - cur_dict = d - while True: - if len(ps) > 0: - k = dict_parse_key(ps[0], with_type) - if isinstance(cur_dict, (tuple, list)): - cur_dict = cur_dict[int(k)] - else: - cur_dict = cur_dict[k] - ps = ps[1:] - else: - return cur_dict - -def get_file_size(f): - return os.stat(f).st_size - -def convert_to_yaml_friendly(result): - if type(result) is dict: - for key, value in result.items(): - if isinstance(value, dict): - result[key] = convert_to_yaml_friendly(value) - elif isinstance(value, np.floating): - result[key] = float(value) - elif isinstance(value, 
np.ndarray): - raise NotImplementedError() - elif type(value) in [int, str, float, bool]: - continue - else: - raise NotImplementedError() - else: - raise NotImplementedError() - return result - -def natural_key(text): - import re - result = [] - for c in re.split(r'([0-9]+(?:[.][0-9]*)?)', text): - try: - result.append(float(c)) - except: - continue - return result - -def natural_sort(strs, key=None): - if key is None: - strs.sort(key=natural_key) - else: - strs.sort(key=lambda s: natural_key(key(s))) - -def get_pca(x, com): - x -= np.mean(x, axis = 0) - cov = np.cov(x, rowvar=False) - from scipy import linalg as LA - evals , evecs = LA.eigh(cov) - total_val = np.sum(evals) - idx = np.argsort(evals)[::-1] - evecs = evecs[:,idx] - evals = evals[idx] - component_val = np.sum(evals[:com]) - logging.info('kept: {}/{}={}'.format(component_val, - total_val, component_val / total_val)) - a = np.dot(x, evecs[:, :com]) - return a - -def plot_distribution(x, y, color=None, fname=None): - import seaborn as sns - x = sns.jointplot(x, y, kind='kde', - color=color) - if fname: - x.savefig(fname) - plt.close() - else: - plt.show() - -g_key_to_cache = {} -def run_if_not_memory_cached(func, *args, **kwargs): - force = False - if '__force' in kwargs: - if kwargs['__force']: - force = True - del kwargs['__force'] - if '__key' in kwargs: - key = kwargs.pop('__key') - else: - import pickle as pkl - key = hash_sha1(pkl.dumps(OrderedDict({'arg': pformat(args), 'kwargs': - pformat(kwargs), 'func_name': func.__name__}))) - global g_key_to_cache - - if key in g_key_to_cache and not force: - return g_key_to_cache[key] - else: - result = func(*args, **kwargs) - g_key_to_cache[key] = result - return result - -def run_if_not_cached(func, *args, **kwargs): - force = False - if '__force' in kwargs: - if kwargs['__force']: - force = True - del kwargs['__force'] - - import pickle as pkl - key = hash_sha1(pkl.dumps(OrderedDict({'arg': pformat(args), 'kwargs': - pformat(kwargs), 'func_name': func.__name__}))) - cache_folder = op.expanduser('./output/run_if_not_cached/') - cache_file = op.join(cache_folder, key) - - if op.isfile(cache_file) and not force: - return pkl.loads(read_to_buffer(cache_file)) - else: - result = func(*args, **kwargs) - write_to_file(pkl.dumps(result), cache_file) - return result - -def convert_to_command_line(param, script): - logging.info(pformat(param)) - x = copy.deepcopy(param) - result = "python {} -bp {}".format( - script, - base64.b64encode(dump_to_yaml_bytes(x)).decode()) - return result - -def print_table(a_to_bs, all_key=None, latex=False, **kwargs): - if len(a_to_bs) == 0: - return - if not latex: - all_line = get_table_print_lines(a_to_bs, all_key) - logging.info('\n{}'.format('\n'.join(all_line))) - else: - from .latex_writer import print_simple_latex_table - if all_key is None: - all_key = list(set(a for a_to_b in a_to_bs for a in a_to_b)) - all_key = sorted(all_key) - x = print_simple_latex_table(a_to_bs, - all_key, **kwargs) - logging.info('\n{}'.format(x)) - return x - -def get_table_print_lines(a_to_bs, all_key): - if len(a_to_bs) == 0: - logging.info('no rows') - return [] - if not all_key: - all_key = [] - for a_to_b in a_to_bs: - all_key.extend(a_to_b.keys()) - all_key = sorted(list(set(all_key))) - all_width = [max([len(str(a_to_b.get(k, ''))) for a_to_b in a_to_bs] + - [len(k)]) for k in all_key] - row_format = ' '.join(['{{:{}}}'.format(w) for w in all_width]) - - all_line = [] - line = row_format.format(*all_key) - all_line.append(line.strip()) - for a_to_b in a_to_bs: - line 
= row_format.format(*[str(a_to_b.get(k, '')) for k in all_key]) - all_line.append(line) - return all_line - -def is_hvd_initialized(): - try: - import horovod.torch as hvd - hvd.size() - return True - except ImportError: - return False - except ValueError: - return False - -def get_user_name(): - import getpass - return getpass.getuser() - -def decode_general_cmd(extraParam): - re_result = re.match('.*python (?:scripts|src)/.*\.py -bp (.*)', extraParam) - if re_result and len(re_result.groups()) == 1: - ps = load_from_yaml_str(base64.b64decode(re_result.groups()[0])) - return ps - -def print_job_infos(all_job_info): - all_key = [ - 'cluster', - 'status', - 'appID-s', - 'result', - 'elapsedTime', - 'elapsedFinished', - 'mem_used', - 'gpu_util', - 'speed', - 'left'] - keys = ['data', 'net', 'expid'] - meta_keys = ['num_gpu'] - all_key.extend(keys) - all_key.extend(meta_keys) - - # find the keys whose values are the same - def all_equal(x): - assert len(x) > 0 - return all(y == x[0] for y in x[1:]) - - if len(all_job_info) > 1: - equal_keys = [k for k in all_key if all_equal([j.get(k) for j in all_job_info])] - if len(equal_keys) > 0: - logging.info('equal key values for all jobs') - print_table(all_job_info[0:1], all_key=equal_keys) - all_key = [k for k in all_key if not all_equal([j.get(k) for j in all_job_info])] - - print_table(all_job_info, all_key=all_key) - -def parse_eta_in_hours(left): - pattern = '(?:([0-9]*) day[s]?, )?([0-9]*):([0-9]*):([0-9]*)' - result = re.match(pattern, left) - if result: - gs = result.groups() - gs = [float(g) if g else 0 for g in gs] - assert int(gs[0]) == gs[0] - days = int(gs[0]) - hours = gs[1] + gs[2] / 60. + gs[3] / 3600 - return days, hours - return -1, -1 - -def attach_philly_maskrcnn_log_if_is(all_log, job_info): - for log in reversed(all_log): - # Philly, maskrcnn-benchmark log - pattern = '([0-9\. :-]): .*: eta: (.*) iter: [0-9]*[ ]*speed: ([0-9\.]*).*' - - result = re.match(pattern, log) - if result and result.groups(): - log_time, left, speed = result.groups() - job_info['speed'] = speed - from datetime import datetime, timezone - now = datetime.now(timezone.utc) - from dateutil.parser import parse - log_time = parse(log_time) - job_info['log_time'] = log_time - delay = (now - log_time).total_seconds() - d, h = parse_eta_in_hours(left) - job_info['left'] = '{}-{:.1f}h({:.1f}s)'.format(d, h, delay) - job_info['eta'] = calc_eta(d, h) - return True - return False - -def calc_eta(days, hours): - from datetime import timedelta - x = datetime.now() + timedelta(days=days, hours=hours + 1) - return '{}/{}-{}'.format(x.month, x.day, x.hour) - -def attach_mmask_caption_itp_multi_line_log_if_is(all_log, job_info): - for log in reversed(all_log): - pattern = r'.*(202[0-9:\ \-\.]+)\,.* (?:train|mmask_caption|mmask_pretrain)\.py\:.*(?:train)\(\):.*eta[ ]*:(.*) iter: [0-9]*' - #2020-10-13 20:15:24 [21ddde126d88bf2a12f76f300ddaac02-ps-0, 10.43.224.4] [1,0]:2020-10-13 20:15:23,329.329 mmask_caption.py:186 train(): eta : 3:37:16 iter: 2460 max mem : 4541 - result = re.match(pattern, log) - if result and result.groups(): - log_time, left = result.groups() - from dateutil.parser import parse - log_time = parse(log_time) - job_info['log_time'] = log_time - # log_time here is UTC. 
convert it to local time - d, h = parse_eta_in_hours(left.strip()) - job_info['left'] = '{}-{:.1f}h'.format(d, h) - job_info['eta'] = calc_eta(d, h) - return True - return False - -def attach_mmask_aml_multi_line_log_if_is(all_log, job_info): - for log in reversed(all_log): - pattern = r'(202[0-9:\ \-\.]+)\,.* (?:mmask_caption|mmask_pretrain)\.py\:.*(?:train)\(\):.*eta[ ]*:(.*) iter: [0-9]*' - #2020-10-08 03:00:47,292.292 mmask_caption.py:186 train(): eta : 3:40:55 iter: 1900 max mem : 4541 - result = re.match(pattern, log) - if result and result.groups(): - log_time, left = result.groups() - from dateutil.parser import parse - log_time = parse(log_time) - job_info['log_time'] = log_time - # log_time here is UTC. convert it to local time - d, h = parse_eta_in_hours(left.strip()) - job_info['left'] = '{}-{:.1f}h'.format(d, h) - job_info['eta'] = calc_eta(d, h) - return True - return False - -def attach_itp_mmask_log_if_is(all_log, job_info): - for log in reversed(all_log): - pattern = r'.*:(.*),.* (?:trainer|mmask_pretrain)\.py.*(?:do_train|do_train_dict|train)\(\): eta: (.*) iter: [0-9]*.*' - result = re.match(pattern, log) - if result and result.groups(): - log_time, left = result.groups() - from dateutil.parser import parse - log_time = parse(log_time) - job_info['log_time'] = log_time - # log_time here is UTC. convert it to local time - d, h = parse_eta_in_hours(left) - job_info['left'] = '{}-{:.1f}h'.format(d, h) - job_info['eta'] = calc_eta(d, h) - return True - return False - -def attach_itp_log_if_is(all_log, job_info): - for log in reversed(all_log): - pattern = r'.*:(.*),.*(?:base_trainer|trainer|mmask_pretrain|image_text_retrieval|vqa)\.py.*(?:do_train|_logistics|do_train_dict|train|old_train)\(\): eta:[ ]*(.*) iter: [0-9]*[ ]*speed: ([0-9\.]*).*' - result = re.match(pattern, log) - if result and result.groups(): - log_time, left, speed = result.groups() - job_info['speed'] = speed - from dateutil.parser import parse - log_time = parse(log_time) - job_info['log_time'] = log_time - # log_time here is UTC. convert it to local time - d, h = parse_eta_in_hours(left) - job_info['left'] = '{}-{:.1f}h'.format(d, h) - job_info['eta'] = calc_eta(d, h) - return True - return False - -def attach_aml_maskrcnn_log_if_is(all_log, job_info): - for log in reversed(all_log): - pattern = r'(.*),.* (?:mmask_pretrain|trainer|vqa)\.py.*(?:old_train|train|do_train|do_train_dict)\(\): eta: (.*) iter: [0-9]*[ ]*speed: ([0-9\.]*).*' - - result = re.match(pattern, log) - if result and result.groups(): - log_time, left, speed = result.groups() - job_info['speed'] = speed - from dateutil.parser import parse - log_time = parse(log_time) - job_info['log_time'] = log_time - # log_time here is UTC. convert it to local time - d, h = parse_eta_in_hours(left) - job_info['left'] = '{}-{:.1f}h'.format(d, h) - job_info['eta'] = calc_eta(d, h) - return True - return False - -def attach_aml_detectron2_log_if_is(all_log, job_info): - for log in reversed(all_log): - pattern = r'(.*),.* events\.py.*eta: (.*) iter: [0-9]*.*' - result = re.match(pattern, log) - if result and result.groups(): - log_time, left = result.groups() - from dateutil.parser import parse - log_time = parse(log_time) - job_info['log_time'] = log_time - # log_time here is UTC. 
convert it to local time - d, h = parse_eta_in_hours(left) - job_info['left'] = '{}-{:.1f}h'.format(d, h) - job_info['eta'] = calc_eta(d, h) - return True - return False - -def attach_philly_caffe_log_if_is(all_log, job_info): - for log in reversed(all_log): - # philly, caffe log - pattern = '.*solver\.cpp:[0-9]*] Iteration [0-9]* \(.* iter\/s, ([0-9\.]*s\/100) iters, left: ([0-9\.]*h)\), loss = [0-9\.]*' - result = re.match(pattern, log) - if result and result.groups(): - job_info['speed'], job_info['left'] = result.groups() - return True - return False - -def attach_gpu_utility_from_log(all_log, job_info): - for log in reversed(all_log): - # philly, caffe log aml_server or philly_server log - pattern = '.*_server.py:.*monitor.*\[(.*)\]' - result = re.match(pattern, log) - if result and result.groups(): - try: - all_info = json.loads('[{}]'.format(result.groups()[0].replace('\'', '\"'))) - min_gpu_mem = min([i['mem_used'] for i in all_info]) - max_gpu_mem = max([i['mem_used'] for i in all_info]) - min_gpu_util = min([i['gpu_util'] for i in all_info]) - max_gpu_util = max([i['gpu_util'] for i in all_info]) - # GB - job_info['mem_used'] = '{}-{}'.format(round(min_gpu_mem/1024, 1), - round(max_gpu_mem/1024., 1)) - job_info['gpu_util'] = '{}-{}'.format(min_gpu_util, max_gpu_util) - return True - except: - return False - return False - -def attach_log_parsing_result(job_info): - # run unit test if modified - logs = job_info.get('latest_log') - if logs is None: - return - all_log = logs.split('\n') - del job_info['latest_log'] - attach_gpu_utility_from_log(all_log, job_info) - if attach_itp_log_if_is(all_log, job_info): - return - if attach_philly_maskrcnn_log_if_is(all_log, job_info): - return - if attach_aml_maskrcnn_log_if_is(all_log, job_info): - return - if attach_philly_caffe_log_if_is(all_log, job_info): - return - if attach_aml_detectron2_log_if_is(all_log, job_info): - return - if attach_itp_mmask_log_if_is(all_log, job_info): - return - if attach_mmask_aml_multi_line_log_if_is(all_log, job_info): - return - if attach_mmask_caption_itp_multi_line_log_if_is(all_log, job_info): - return - # the following is designed to cover any examples - if attach_any_log(all_log, job_info): - return - -def attach_any_log(all_log, job_info): - # to check the correctness, run: py.test --ipdb src/qd/unittest/test_qd_common.py -k test_attach_any_log - for log in reversed(all_log): - pattern = r'([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}).*\.py.*\(\): eta:[ ]*(.*) iter: [0-9]*[ ]*speed: ([0-9\.]*).*' - result = re.match(pattern, log) - if result and result.groups(): - log_time, left, speed = result.groups() - job_info['speed'] = speed - from dateutil.parser import parse - log_time = parse(log_time) - job_info['log_time'] = log_time - # log_time here is UTC. 
convert it to local time - d, h = parse_eta_in_hours(left) - job_info['left'] = '{}-{:.1f}h'.format(d, h) - job_info['eta'] = calc_eta(d, h) - return True - return False - -def print_offensive_folder(folder): - all_folder = os.listdir(folder) - name_to_size = {} - for i, f in enumerate(qd_tqdm(all_folder)): - sec = 60 * 10 - f = op.join(folder, f) - size = run_if_not_cached(get_folder_size, f, sec) - name_to_size[f] = size - logging.info('{}: {}'.format(f, size)) - logging.info(', '.join([op.basename(n) for n, s in name_to_size.items() if - s < 0])) - -def get_folder_size(f, sec): - cmd = ['du', '--max-depth=0', f] - import subprocess - from subprocess import check_output - try: - out = check_output(cmd, timeout=sec) - except subprocess.TimeoutExpired: - logging.info('{}'.format(f)) - return -1 - out = out.decode() - size = [x.strip() for x in out.split('\t')][0] - return int(size) - -class make_namespace_by_dict(object): - def __init__(self, d): - for k in d: - v = d[k] - if type(v) is dict: - self.__dict__[k] = make_namespace_by_dict(v) - else: - self.__dict__[k] = v - def clone(self): - c = copy.deepcopy(self.__dict__) - return make_namespace_by_dict(c) - - def __repr__(self): - return '{}'.format(pformat(self.__dict__)) - -@try_once -def try_get_cpu_info(): - command = 'cat /proc/cpuinfo' - return os.popen(command).read().strip() - -# ---------------------------------------------------- pytorch speed analysis -def create_speed_node(info): - node = Tree() - node.add_features(**info) - return node - -def speed_tree_insert(root, node): - while True: - need_merge_nodes = [c for c in root.children - if is_child_parent(c.name, node.name)] - if len(need_merge_nodes) > 0: - for x in need_merge_nodes: - x.detach() - for x in need_merge_nodes: - node.add_child(x) - root.add_child(node) - return - go_deeper_nodes = [c for c in root.children if - is_child_parent(node.name, c.name)] - if len(go_deeper_nodes) == 0: - root.add_child(node) - return - else: - assert len(go_deeper_nodes) == 1 - root = go_deeper_nodes[0] - -def is_child_parent(c, p): - if p == '': - return True - return c.startswith(p + '.') - -def speed_trees_insert(roots, info): - node = create_speed_node(info) - # we assume the name are not equal - need_merge_nodes = [r for r in roots - if is_child_parent(r.name, info['name'])] - if len(need_merge_nodes) > 0: - for x in need_merge_nodes: - node.add_child(x) - roots.remove(x) - roots.append(node) - return - need_insert_roots = [r for r in roots - if is_child_parent(info['name'], r.name)] - if len(need_insert_roots) == 0: - roots.append(node) - elif len(need_insert_roots) == 1: - speed_tree_insert(need_insert_roots[0], node) - else: - raise Exception() - -def build_speed_tree(component_speeds): - roots = [] - for c in component_speeds: - speed_trees_insert(roots, c) - return roots - -def get_vis_str(component_speeds): - roots = build_speed_tree(component_speeds) - if len(roots) == 0: - return '' - assert len(roots) == 1, roots - root = roots[0] - for n in root.iter_search_nodes(): - n.global_avg_in_ms = round(1000. * n.global_avg, 1) - for n in root.iter_search_nodes(): - s = sum([c.global_avg for c in n.children]) - n.unique_in_ms = round(1000. 
* (n.global_avg - s), 1) - return root.get_ascii(attributes= - ['name', 'global_avg_in_ms', 'unique_in_ms', 'count']) - -def create_vis_net_file(speed_yaml, vis_txt): - info = load_from_yaml_file(speed_yaml) - if type(info) is list: - info = info[0] - assert type(info) is dict - component_speeds = info['meters'] - write_to_file(get_vis_str(component_speeds), vis_txt) - -# --------------------------------------------------------------------- - -def dict_add(d, k, v): - if k not in d: - d[k] = v - else: - d[k] += v - -def calc_mean(x): - return sum(x) / len(x) - -def compare_gmap_evals(all_eval_file, - label_to_testcount=None, - output_prefix='out'): - result = ['\n'] - all_result = [load_from_yaml_file(f) for f in all_eval_file] - all_cat2map = [result['class_ap'] for result in all_result] - - cats = list(all_cat2map[0].keys()) - gains = [all_cat2map[1][c] - all_cat2map[0][c] for c in cats] - - all_info = [{'name': c, 'acc_gain': g} for c, g in zip(cats, gains)] - all_info = sorted(all_info, key=lambda x: x['acc_gain']) - - all_map = [sum(cat2map.values()) / len(cat2map) for cat2map in all_cat2map] - result.append('all map = {}'.format(', '.join( - map(lambda x: str(round(x, 3)), all_map)))) - - non_zero_cats = [cat for cat, ap in all_cat2map[0].items() - if all_cat2map[1][cat] > 0 and ap > 0] - logging.info('#non zero cat = {}'.format(len(non_zero_cats))) - for cat2map in all_cat2map: - logging.info('non zero cat mAP = {}'.format( - calc_mean([cat2map[c] for c in non_zero_cats]))) - - if label_to_testcount is not None: - all_valid_map = [calc_mean([ap for cat, ap in cat2map.items() if - label_to_testcount.get(cat, 0) > - 50]) for cat2map in all_cat2map] - result.append('all valid map = {}'.format(', '.join( - map(lambda x: str(round(x, 3)), all_valid_map)))) - valid_cats = set([l for l, c in label_to_testcount.items() if c > 50]) - - max_aps = [max([cat2map[c] for cat2map in all_cat2map]) for c in cats] - max_map = sum(max_aps) / len(max_aps) - result.append('max map = {:.3f}'.format(max_map)) - - for info in all_info: - for k in info: - if type(info[k]) is float: - info[k] = round(info[k], 2) - result.extend(get_table_print_lines(all_info[:5] + all_info[-6:], ['name', - 'acc_gain', - ])) - if label_to_testcount is not None: - result.append('valid cats only:') - all_valid_info = [i for i in all_info if i['name'] in valid_cats] - result.extend(get_table_print_lines(all_valid_info[:5] + all_valid_info[-6:], - ['name', 'acc_gain', - ])) - - all_acc_gain = [info['acc_gain'] for info in all_info] - logging.info('\n'.join(result)) - - plot_to_file(list(range(len(all_acc_gain))), - all_acc_gain, - output_prefix + '.png') - -def merge_class_names_by_location_id(anno): - if any('location_id' in a for a in anno): - assert all('location_id' in a for a in anno) - location_id_rect = [(a['location_id'], a) for a in anno] - from .qd_common import list_to_dict - location_id_to_rects = list_to_dict(location_id_rect, 0) - merged_anno = [] - for _, rects in location_id_to_rects.items(): - r = copy.deepcopy(rects[0]) - r['class'] = [r['class']] - r['class'].extend((rects[i]['class'] for i in range(1, - len(rects)))) - r['conf'] = [r.get('conf', 1)] - r['conf'].extend((rects[i].get('conf', 1) for i in range(1, - len(rects)))) - merged_anno.append(r) - return merged_anno - else: - assert all('location_id' not in a for a in anno) - for a in anno: - a['class'] = [a['class']] - a['conf'] = [a.get('conf', 1.)] - return anno - -def softnms_c(rects, threshold=0, method=2, **kwargs): - from fast_rcnn.nms_wrapper 
import soft_nms - nms_input = np.zeros((len(rects), 5), dtype=np.float32) - for i, r in enumerate(rects): - nms_input[i, 0:4] = r['rect'] - nms_input[i, -1] = r['conf'] - nms_out = soft_nms(nms_input, threshold=threshold, - method=method, **kwargs) - return [{'rect': list(map(float, x[:4])), 'conf': float(x[-1])} for x in nms_out] - -def softnms(rects, th=0.5): - rects = copy.deepcopy(rects) - result = [] - while len(rects) > 0: - max_idx = max(range(len(rects)), key=lambda i: - rects[i]['conf']) - max_det = rects[max_idx] - result.append(max_det) - rects.remove(max_det) - for j in range(len(rects)): - j_rect = rects[j] - ij_iou = calculate_iou1(max_det['rect'], j_rect['rect']) - rects[j]['conf'] *= math.exp(-ij_iou * ij_iou / th) - return result - -def acquireLock(lock_f='/tmp/lockfile.LOCK'): - ''' acquire exclusive lock file access ''' - import fcntl - locked_file_descriptor = open(lock_f, 'w+') - fcntl.lockf(locked_file_descriptor, fcntl.LOCK_EX) - return locked_file_descriptor - -def releaseLock(locked_file_descriptor): - ''' release exclusive lock file access ''' - locked_file_descriptor.close() - -def inject_yolo_by_maskrcnn_log_to_board(fname, folder): - keys = ['loss', - 'cls', - 'o_noobj', - 'o_obj', - 'wh', - 'xy', - 'time', - 'data', - ] - pattern = ''.join('.*{}: ([0-9\.]*) \(([0-9\.]*)\).*'.format(k)for k in keys) - pattern = '.*iter: ([0-9]*) .*' + 'speed: ([0-9\.]*) images/sec' + pattern - logging.info(pattern) - all_loss = [] - result = parse_nums(pattern, fname) - result_keys = ['iteration', 'speed'] - for k in keys: - result_keys.append(k + '_medium') - result_keys.append(k + '_mean') - all_loss = [dict(zip(result_keys, r)) for r in result] - - logging.info(len(all_loss)) - from torch.utils.tensorboard import SummaryWriter - wt = SummaryWriter(log_dir=folder) - for loss_info in all_loss: - for k in result_keys: - if k == 'iteration': - continue - wt.add_scalar(tag=k, scalar_value=loss_info[k], - global_step=loss_info['iteration']) - -def inject_maskrcnn_log_to_board(fname, folder, keys=None): - if keys is None: - keys = ['loss', - 'criterion_loss', - #'loss_box_reg', - #'loss_classifier', - #'loss_objectness', - #'loss_rpn_box_reg', - 'time', - 'data', - ] - pattern = ''.join('.*{}: ([0-9\.]*) \(([0-9\.]*)\).*'.format(k)for k in keys) - pattern = '.*iter: ([0-9]*) .*' + 'speed: ([0-9\.]*) images/sec' + pattern - logging.info(pattern) - all_loss = [] - result = parse_nums(pattern, fname) - result_keys = ['iteration', 'speed'] - for k in keys: - result_keys.append(k + '_medium') - result_keys.append(k + '_mean') - all_loss = [dict(zip(result_keys, r)) for r in result] - - logging.info(len(all_loss)) - from torch.utils.tensorboard import SummaryWriter - wt = SummaryWriter(log_dir=folder) - for loss_info in all_loss: - for k in result_keys: - if k == 'iteration': - continue - wt.add_scalar(tag=k, scalar_value=loss_info[k], - global_step=loss_info['iteration']) - -class DummyCfg(object): - # provide a signature of clone(), used by maskrcnn checkpointer - def clone(self): - return - -def save_frame_yaml(fn): - assert not op.isfile(fn) - assert fn.endswith('.yaml') - info = get_frame_info(1) - write_to_yaml_file(info, fn) - -def get_frame_info(last=0): - import inspect - frame = inspect.currentframe() - frames = inspect.getouterframes(frame) - frame = frames[1 + last].frame - args, _, _, vs = inspect.getargvalues(frame) - info = {i: vs[i] for i in args} - info['_func'] = frame.f_code.co_name - info['_filepath'] = frame.f_code.co_filename - return info - -def print_frame_info(): 
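    # mirror get_frame_info() above: inspect the caller's stack frame and log
    # the function name plus each argument's value (or its type, when the
    # value cannot be formatted), to trace how a run was configured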
-    import inspect
-    frame = inspect.currentframe()
-    frames = inspect.getouterframes(frame)
-    frame = frames[1].frame
-    args, _, _, vs = inspect.getargvalues(frame)
-    info = []
-    info.append('func name = {}'.format(inspect.getframeinfo(frame)[2]))
-    for i in args:
-        try:
-            info.append('{} = {}'.format(i, vs[i]))
-        except Exception:
-            info.append('type({}) = {}'.format(i, type(vs[i])))
-            continue
-    logging.info('; '.join(info))
-
-def merge_speed_info(speed_yamls, out_yaml):
-    write_to_yaml_file([load_from_yaml_file(y) for y in speed_yamls
-                        if op.isfile(y)], out_yaml)
-
-def merge_speed_vis(vis_files, out_file):
-    from .qd_common import ensure_copy_file
-    if len(vis_files) > 0 and op.isfile(vis_files[0]):
-        ensure_copy_file(vis_files[0], out_file)
-
-def merge_dict_to_cfg(dict_param, cfg):
-    """merge the key, value pairs in dict_param into cfg
-
-    :dict_param: a (possibly nested) dict of parameter overrides
-    :cfg: a yacs CfgNode; keys of dict_param that do not exist in cfg are
-        silently dropped before merging
-    :returns: None; cfg is updated in place
-    """
-    def trim_dict(d, c):
-        """remove all keys of d that do not exist in the config c"""
-        to_remove = [k for k in d if k not in c]
-        for k in to_remove:
-            del d[k]
-        # recurse into nested dict values
-        to_check = [(k, d[k]) for k in d if isinstance(d[k], dict)]
-        for k, t in to_check:
-            trim_dict(t, getattr(c, k))
-    trimmed_param = copy.deepcopy(dict_param)
-    trim_dict(trimmed_param, cfg)
-    from yacs.config import CfgNode
-    cfg.merge_from_other_cfg(CfgNode(trimmed_param))
-
-def execute_func(info):
-    # info = {'from': module_name, 'import': func_name,
-    #         'args': positional_args, 'param': keyword_args}
-    from importlib import import_module
-    modules = import_module(info['from'])
-    if ('param' not in info) and ('args' not in info):
-        return getattr(modules, info['import'])()
-    elif ('param' in info) and ('args' not in info):
-        return getattr(modules, info['import'])(**info['param'])
-    elif ('param' not in info) and ('args' in info):
-        return getattr(modules, info['import'])(*info['args'])
-    else:
-        return getattr(modules, info['import'])(*info['args'], **info['param'])
-
-def detect_error_codes(log_file):
-    all_line = read_to_buffer(log_file).decode().split('\n')
-    error_codes = []
-    for line in all_line:
-        if "raise RuntimeError('NaN encountered!')" in line:
-            error_codes.append('NaN')
-    return list(set(error_codes))
-
-def insensitive_glob(pattern):
-    def either(c):
-        return '[%s%s]' % (c.lower(), c.upper()) if c.isalpha() else c
-    return glob.glob(''.join(map(either, pattern)))
-
-def list_dir(folder):
-    return [f for f in os.listdir(folder) if op.isdir(op.join(folder, f))]
-
-def replace_place_holder(p, place_holder):
-    if isinstance(p, dict):
-        for k, v in p.items():
-            if isinstance(v, str) and v.startswith('$'):
-                p[k] = place_holder[v[1:]]
-            else:
-                replace_place_holder(v, place_holder)
-    elif isinstance(p, (list, tuple)):
-        for i, x in enumerate(p):
-            if isinstance(x, str) and x.startswith('$'):
-                p[i] = place_holder[x[1:]]
-            else:
-                replace_place_holder(x, place_holder)
-
-def execute_pipeline(all_processor):
-    only_processors = [p for p in all_processor if p.get('only')]
-    need_check_processors = only_processors if len(only_processors) > 0 else all_processor
-    result = {}
-    result['process_result'] = []
-    place_holder = {}
-    for p in need_check_processors:
-        if p.get('ignore', False):
-            continue
-        replace_place_holder(p, place_holder)
-        if 'execute_if_true_else_break' in p:
-            r = execute_func(p['execute_if_true_else_break'])
-            if not r:
-                result['result'] = 'prereq_failed'
-                result['failed_prereq'] = p['execute_if_true_else_break']
-                break
-        if p.get('continue_if_true_else_break'):
-            r = execute_func(p['execute'])
-            if not r:
result['result'] = 'prereq_failed' - result['failed_prereq'] = p['execute'] - break - else: - continue - if p.get('force', False): - r = run_if_not_cached(execute_func, p['execute'], - __force=True) - else: - r = run_if_not_cached(execute_func, p['execute']) - if 'output' in p: - place_holder[p['output']] = r - else: - if r is not None: - result['process_result'].append({'process_info': p, 'return': r}) - if p.get('stop_after'): - logging.info('skip the rest since stop_after=True') - break - result['place_holder'] = place_holder - if 'result' not in result: - result['result'] = 'pass' - return result - -def remove_empty_keys_(ds): - keys = set([k for d in ds for k in d]) - empty_keys = [k for k in keys if all(d.get(k) is None for d in ds)] - for k in empty_keys: - for d in ds: - if k in d: - del d[k] - -def max_iter_mult(m, factor): - if isinstance(m, int): - return int(m * factor) - elif isinstance(m, str): - assert m.endswith('e') - return '{}e'.format(int(float(m[:-1]) * factor)) - else: - raise NotImplementedError - -def remove_empty_coco_style(rects, w, h): - rects = [r for r in rects if r.get('iscrowd', 0) == 0] - ret = [] - for r in rects: - x1, y1, x2, y2 = r['rect'] - x1 = min(w, max(0, x1)) - x2 = min(w, max(0, x2)) - y1 = min(h, max(0, y1)) - y2 = min(h, max(0, y2)) - if y2 > y1 and x2 > x1: - r['rect'] = [x1, y1, x2, y2] - ret.append(r) - return ret - -def join_hints(hints, sep='_'): - parts = [] - for h in hints: - if isinstance(h, dict): - parts.append(hash_sha1(h['hint'])[-h['max']:]) - else: - parts.append(str(h)) - return sep.join(parts) - -def qd_tqdm(*args, **kwargs): - desc = kwargs.get('desc', '') - import inspect - frame = inspect.currentframe() - frames = inspect.getouterframes(frame) - frame = frames[1].frame - line_number = frame.f_lineno - fname = op.basename(frame.f_code.co_filename) - message = '{}:{}'.format(fname, line_number) - - if 'desc' in kwargs: - kwargs['desc'] = message + ' ' + desc - else: - kwargs['desc'] = message - - if 'mininterval' not in kwargs: - # every 2 secons; default is 0.1 second which is too frequent - kwargs['mininterval'] = 2 - - return tqdm(*args, **kwargs) - -def get_opened_files(): - import psutil - proc = psutil.Process() - return proc.open_files() - -def print_opened_files(): - logging.info(pformat(get_opened_files())) - -def save_parameters(param, folder): - time_str = datetime.now().strftime('%Y_%m_%d_%H_%M_%S') - - write_to_yaml_file(param, op.join(folder, - 'parameters_{}.yaml'.format(time_str))) - # save the env parameters - # convert it to dict for py3 - write_to_yaml_file(dict(os.environ), op.join(folder, - 'env_{}.yaml'.format(time_str))) - -def exclusive_open_to_read(fname, mode='r'): - disable_lock = os.environ.get('QD_DISABLE_EXCLUSIVE_READ_BY_LOCK') - if disable_lock is not None: - disable_lock = int(disable_lock) - if not disable_lock: - user_name = get_user_name() - from .qd_common import acquireLock, releaseLock - lock_fd = acquireLock(op.join('/tmp', - '{}_lock_{}'.format(user_name, hash_sha1(fname)))) - #try: - # in AML, it could fail with Input/Output error. 
If it fails, we will - # use azcopy as a fall back solution for reading - fp = limited_retry_agent(10, open, fname, mode) - #except: - #if 'FILE_OPEN_AZCOPY_BLOB_ACCOUNT_PATH' in os.environ: - #return azcopy_read(fname) - #else: - #raise - if not disable_lock: - releaseLock(lock_fd) - return fp - -def inject_log_to_board(fname, folder, pattern, keys): - logging.info(pattern) - from torch.utils.tensorboard import SummaryWriter - - from .qd_common import iter_match_document - wt = SummaryWriter(log_dir=folder) - x_keys = [k['key'] for k in keys if k['is_x']] - x_key_time = any([ - k for k in keys - if k['is_x'] and k.get('type') == 'time' - ]) - if len(x_keys) == 1: - x_key = x_keys[0] - if len(x_keys) == 0: - x_key = None - added = 0 - for i, r in enumerate(iter_match_document(pattern, fname)): - info = {} - for k, x in zip(keys, r): - if not k.get('type'): - info[k['key']] = float(x) - elif k['type'] == 'time': - info[k['key']] = datetime.datetime.strptime( - x, '%Y-%m-%d %H:%M:%S') - added += 1 - for k, v in info.items(): - if x_key and k == x_key: - continue - args = {'tag': k, 'scalar_value': v} - if not x_key_time: - if x_key: - args['global_step'] = info[x_key] - else: - args['global_step'] = i - else: - args['walltime'] = (info[x_key]-datetime.datetime(1970,1,1)).total_seconds() - wt.add_scalar(**args) - #wt.add_scalar(tag=k, scalar_value=v, global_step=info[x_key]) - logging.info(added) - -def auto_parse_log_line(line): - must_have = ['iter', 'speed', 'loss', 'lr'] - result = {} - if not all(m in line for m in must_have): - # in this case, we will try to parse if there is like acc = {} - if '=' not in line: - return result - parts = re.split(':|,|;', line) - #parts = line.split(':') - for p in parts: - if '=' in p: - sub_parts = p.split('=') - if len(sub_parts) != 2: - continue - k, v = map(lambda x: x.strip(), sub_parts) - try: - result[k] = float(v) - except: - continue - if len(result) > 0: - matched = re.match('([0-9-\s:]+)', list(line.split(','))[0]) - if matched is not None: - x = matched.groups()[0] - try: - result['time'] = datetime.strptime( - x.strip(), '%Y-%m-%d %H:%M:%S') - except: - return {} - return result - else: - parts = line.split(' ') - #([0-9-\s:]*) - for p in parts: - kv = list(map(lambda x: x.strip(), p.split(':'))) - if len(kv) != 2: - continue - key = kv[0] - vs = list(map(lambda x: x.strip(' ()'), kv[1].split(' '))) - for idx_v, v in enumerate(vs): - try: - v1 = float(vs[0]) - if len(vs) > 1: - result[key + '_{}'.format(idx_v)] = v1 - else: - result[key] = v1 - except: - continue - if len(result): - matched = re.match('([0-9-\s:]*)', line) - x = matched.groups()[0] - try: - result['time'] = datetime.strptime( - x.strip(), '%Y-%m-%d %H:%M:%S') - except: - pass - - return result - - -def identity(x): - # only used in pipeline - return x - -def recursive_type_convert(info, t, convert_func): - if isinstance(info, (tuple, list)): - return [recursive_type_convert(i, t, convert_func) for i in info] - elif isinstance(info, dict): - return dict((k, recursive_type_convert(v, t, convert_func)) for k, v in info.items()) - elif isinstance(info, t): - return convert_func(info) - else: - return info - -def blobfuse_umount(mount_folder): - cmd = ['sudo', 'umount', mount_folder] - cmd_run(cmd) - -def blobfuse_mount(account_name, container_name, account_key, - mount_folder, cache_folder): - ensure_directory(mount_folder) - ensure_directory(cache_folder) - cmd = [ - 'blobfuse', - mount_folder, - '--tmp-path={}'.format(cache_folder), - 
-        '--container-name={}'.format(container_name),
-    ]
-    env = {
-        'AZURE_STORAGE_ACCOUNT': account_name,
-        'AZURE_STORAGE_ACCESS_KEY': account_key,
-    }
-    cmd_run(cmd, env=env)
-
-def blobfuse_mount_from_config(config, mount_folder):
-    info = load_from_yaml_file(config)
-    cache_folder = '/mnt/blobfuse/cache/{}'.format(hash_sha1(mount_folder))
-    blobfuse_mount(
-        account_name=info['account_name'],
-        container_name=info['container_name'],
-        account_key=info['account_key'],
-        mount_folder=mount_folder,
-        cache_folder=cache_folder,
-    )
-
-def query_all_opened_file_in_system():
-    fs = []
-    for proc in psutil.process_iter():
-        try:
-            for item in proc.open_files():
-                fs.append(item.path)
-        except Exception:
-            pass
-    return list(set(fs))
-
-def has_handle(fpath, opened_files=None):
-    fpath = op.abspath(op.realpath(fpath))
-    if opened_files is None:
-        for proc in psutil.process_iter():
-            try:
-                for item in proc.open_files():
-                    if fpath == item.path:
-                        return True
-            except Exception:
-                pass
-        return False
-    else:
-        return fpath in opened_files
-
-def submit_to_evalai(fname, message, challenge_id, phase_id):
-    cmd = [
-        'evalai',
-        'challenge',
-        str(challenge_id),
-        'phase',
-        str(phase_id), 'submit', '--file',
-        fname
-    ]
-    input_str = 'y\n{}\n\n\n\n\n'.format(message)
-    import subprocess as sp
-    try:
-        submission_command_stdout = cmd_run(cmd,
-                                            process_input=input_str.encode(),
-                                            stdout=sp.PIPE,
-                                            )[0].decode("utf-8")
-    except Exception as ex:
-        if 'The maximum number of submission for today' in str(ex):
-            logging.info(str(ex))
-            return
-        else:
-            raise
-    submission_id_regex = re.search("evalai submission ([0-9]+)",
-                                    submission_command_stdout)
-    submission_id = submission_id_regex.group(1)
-    cmd = ["evalai", "submission", submission_id, "result"]
-    return ' '.join(cmd)
-
-def submit_to_evalai_for_vqa(fname, message):
-    return submit_to_evalai(fname, message, 830, 1793)
-
-def submit_to_evalai_for_nocaps_xd(fname, split, message):
-    challenge_id = 464
-    if split == 'val':
-        phase_id = 962
-    else:
-        assert split == 'test'
-        phase_id = 963
-    return submit_to_evalai(
-        fname, message, challenge_id, phase_id)
-
-def submit_to_evalai_for_nocaps(fname, split, message):
-    challenge_id = 355
-    if split == 'val':
-        phase_id = 742
-    else:
-        assert split == 'test'
-        phase_id = 743
-    return submit_to_evalai(
-        fname, message, challenge_id, phase_id)
-
-def recover_stdout_error():
-    import sys
-    sys.stdout = sys.__stdout__
-    sys.stderr = sys.__stderr__
-
-def switch_case(switch, case, default):
-    return case.get(switch, default)
-
-class wb(object):
-    initialized = False
-    enabled = True
-
-    @classmethod
-    def ensure_initialized(cls):
-        if not cls.initialized:
-            cls.initialized = True
-            cls.enabled = int(os.environ.get('QD_WANDB_ENABLED', '0'))
-            if get_mpi_rank() != 0:
-                cls.enabled = False
-            if cls.enabled:
-                try:
-                    import wandb
-
-                    # https://docs.wandb.ai/library/init#init-start-error
-                    wandb.init(settings=wandb.Settings(start_method="fork"))
-                except Exception:
-                    print_trace()
-                    logging.info('wandb.init failed; disabling wandb')
-                    cls.enabled = False
-
-    @classmethod
-    def watch(cls, *args, **kwargs):
-        cls.ensure_initialized()
-        if cls.enabled:
-            import wandb
-            wandb.watch(*args, **kwargs)
-
-    @classmethod
-    def log(cls, *args, **kwargs):
-        cls.ensure_initialized()
-        if cls.enabled:
-            import wandb
-            wandb.log(*args, **kwargs)
-
-if __name__ == '__main__':
-    init_logging()
-    kwargs = parse_general_args()
-    logging.info('param:\n{}'.format(pformat(kwargs)))
-    function_name = kwargs['type']
del kwargs['type'] - locals()[function_name](**kwargs) - diff --git a/AVLFormer/src/utils/tsv_file.py b/AVLFormer/src/utils/tsv_file.py deleted file mode 100755 index 347f704..0000000 --- a/AVLFormer/src/utils/tsv_file.py +++ /dev/null @@ -1,166 +0,0 @@ -import logging -import os -import os.path as op - -from .comm import is_main_process -from .qd_common import exclusive_open_to_read - -logger = logging.getLogger(__name__) -if not is_main_process(): - logger.disabled = True - - -def create_lineidx(filein, idxout): - idxout_tmp = idxout + '.tmp' - with exclusive_open_to_read(filein, 'r') as tsvin, open(idxout_tmp, - 'w') as tsvout: - fsize = os.fstat(tsvin.fileno()).st_size - fpos = 0 - while fpos != fsize: - tsvout.write(str(fpos) + "\n") - tsvin.readline() - fpos = tsvin.tell() - os.rename(idxout_tmp, idxout) - - -def read_to_character(fp, c): - result = [] - while True: - s = fp.read(32) - assert s != '' - if c in s: - result.append(s[:s.index(c)]) - break - else: - result.append(s) - return ''.join(result) - - -class TSVFile(object): - - def __init__(self, tsv_file, generate_lineidx=False): - self.tsv_file = tsv_file - self.lineidx = op.splitext(tsv_file)[0] + '.lineidx' - self._fp = None - self._lineidx = None - # the process always keeps the process which opens the file. - # If the pid is not equal to the currrent pid, we will re-open the file. - self.pid = None - # generate lineidx if not exist - if not op.isfile(self.lineidx) and generate_lineidx: - create_lineidx(self.tsv_file, self.lineidx) - - def __del__(self): - if self._fp: - self._fp.close() - - def __str__(self): - return "TSVFile(tsv_file='{}')".format(self.tsv_file) - - def __repr__(self): - return str(self) - - def num_rows(self): - self._ensure_lineidx_loaded() - return len(self._lineidx) - - def seek(self, idx): - self._ensure_tsv_opened() - self._ensure_lineidx_loaded() - try: - pos = self._lineidx[idx] - except: - logger.info('{}-{}'.format(self.tsv_file, idx)) - raise - self._fp.seek(pos) - return [s.strip() for s in self._fp.readline().split('\t')] - - def seek_first_column(self, idx): - self._ensure_tsv_opened() - self._ensure_lineidx_loaded() - pos = self._lineidx[idx] - self._fp.seek(pos) - return read_to_character(self._fp, '\t') - - def get_key(self, idx): - return self.seek_first_column(idx) - - def __getitem__(self, index): - return self.seek(index) - - def __len__(self): - return self.num_rows() - - def _ensure_lineidx_loaded(self): - if self._lineidx is None: - logger.info('loading lineidx: {}'.format(self.lineidx)) - with exclusive_open_to_read(self.lineidx, 'r') as fp: - self._lineidx = [int(i.strip()) for i in fp.readlines()] - - def _ensure_tsv_opened(self): - if self._fp is None: - self._fp = exclusive_open_to_read(self.tsv_file, 'r') - self.pid = os.getpid() - - if self.pid != os.getpid(): - logger.info('re-open {} because the process id changed'.format( - self.tsv_file)) - self._fp = exclusive_open_to_read(self.tsv_file, 'r') - self.pid = os.getpid() - - -class CompositeTSVFile(): - - def __init__(self, file_list, seq_file, root='.'): - if isinstance(file_list, str): - self.file_list = load_list_file(file_list) - else: - assert isinstance(file_list, list) - self.file_list = file_list - - self.seq_file = seq_file - self.root = root - self.initialized = False - self.initialize() - - def get_key(self, index): - idx_source, idx_row = self.seq[index] - k = self.tsvs[idx_source].get_key(idx_row) - return '_'.join([self.file_list[idx_source], k]) - - def num_rows(self): - return len(self.seq) - - def 
__getitem__(self, index): - idx_source, idx_row = self.seq[index] - return self.tsvs[idx_source].seek(idx_row) - - def __len__(self): - return len(self.seq) - - def initialize(self): - ''' - this function has to be called in init function if cache_policy is - enabled. Thus, let's always call it in init funciton to make it simple. - ''' - if self.initialized: - return - self.seq = [] - with exclusive_open_to_read(self.seq_file, 'r') as fp: - for line in fp: - parts = line.strip().split('\t') - self.seq.append([int(parts[0]), int(parts[1])]) - self.tsvs = [TSVFile(op.join(self.root, f)) for f in self.file_list] - self.initialized = True - - def get_composite_source_idx(self): - return [int(i) for i, _ in self.seq] - - -def load_list_file(fname): - with exclusive_open_to_read(fname, 'r') as fp: - lines = fp.readlines() - result = [line.strip() for line in lines] - if len(result) > 0 and result[-1] == '': - result = result[:-1] - return result diff --git a/AVLFormer/src/utils/tsv_file_ops.py b/AVLFormer/src/utils/tsv_file_ops.py deleted file mode 100755 index 279b572..0000000 --- a/AVLFormer/src/utils/tsv_file_ops.py +++ /dev/null @@ -1,492 +0,0 @@ -import base64 -import json -import math -import os -import os.path as op - -import cv2 -import numpy as np -from tqdm import tqdm - -from .miscellaneous import ( - ensure_directory, - exclusive_open_to_read, - load_from_yaml_file, - mkdir, - write_to_yaml_file, -) -from .tsv_file import TSVFile - - -def img_from_base64(imagestring): - try: - jpgbytestring = base64.b64decode(imagestring) - nparr = np.frombuffer(jpgbytestring, np.uint8) - r = cv2.imdecode(nparr, cv2.IMREAD_COLOR) - return r - except ValueError: - return None - - -def load_linelist_file(linelist_file): - if linelist_file is not None: - line_list = [] - with exclusive_open_to_read(linelist_file, 'r') as fp: - for i in fp: - line_list.append(int(i.strip())) - return line_list - - -def tsv_writer(values, tsv_file_name, sep='\t'): - ensure_directory(os.path.dirname(tsv_file_name)) - tsv_lineidx_file = os.path.splitext(tsv_file_name)[0] + '.lineidx' - tsv_8b_file = tsv_lineidx_file + '.8b' - idx = 0 - tsv_file_name_tmp = tsv_file_name + '.tmp' - tsv_lineidx_file_tmp = tsv_lineidx_file + '.tmp' - tsv_8b_file_tmp = tsv_8b_file + '.tmp' - import sys - is_py2 = sys.version_info.major == 2 - if not is_py2: - sep = sep.encode() - with open(tsv_file_name_tmp, - 'wb') as fp, open(tsv_lineidx_file_tmp, - 'w') as fpidx, open(tsv_8b_file_tmp, - 'wb') as fp8b: - assert values is not None - for value in values: - assert value is not None - if is_py2: - v = sep.join( - map( - lambda v: v.encode('utf-8') - if isinstance(v, unicode) else str(v), value)) + '\n' - else: - value = map( - lambda v: v - if type(v) == bytes else str(v).encode(), value) - v = sep.join(value) + b'\n' - fp.write(v) - fpidx.write(str(idx) + '\n') - # although we can use sys.byteorder to retrieve the system-default - # byte order, let's use little always to make it consistent and - # simple - fp8b.write(idx.to_bytes(8, 'little')) - idx = idx + len(v) - # the following might crash if there are two processes which are writing at - # the same time. One process finishes the renaming first and the second one - # will crash. In this case, we know there must be some errors when you run - # the code, and it should be a bug to fix rather than to use try-catch to - # protect it here. 
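    # the renames below publish the .tmp files only after every row has been
    # written; os.rename() is atomic on POSIX within one filesystem, so a
    # concurrent reader sees either the old complete file or the new complete
    # file, never a half-written TSV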
- os.rename(tsv_file_name_tmp, tsv_file_name) - os.rename(tsv_lineidx_file_tmp, tsv_lineidx_file) - os.rename(tsv_8b_file_tmp, tsv_8b_file) - - -def tsv_reader(tsv_file, sep='\t'): - with exclusive_open_to_read(tsv_file, 'r') as fp: - for i, line in enumerate(fp): - yield [x.strip() for x in line.split(sep)] - - -def config_save_file(tsv_file, save_file=None, append_str='.new.tsv'): - if save_file is not None: - return save_file - return op.splitext(tsv_file)[0] + append_str - - -def get_line_list(linelist_file=None, num_rows=None): - if linelist_file is not None: - return load_linelist_file(linelist_file) - - if num_rows is not None: - return [i for i in range(num_rows)] - - -def generate_hw_file(img_file, save_file=None): - rows = tsv_reader(img_file) - - def gen_rows(): - for i, row in tqdm(enumerate(rows)): - row1 = [row[0]] - img = img_from_base64(row[-1]) - height = img.shape[0] - width = img.shape[1] - row1.append(json.dumps([{"height": height, "width": width}])) - yield row1 - - save_file = config_save_file(img_file, save_file, '.hw.tsv') - tsv_writer(gen_rows(), save_file) - - -def generate_labelmap_file(label_file, save_file=None): - rows = tsv_reader(label_file) - labelmap = [] - for i, row in enumerate(rows): - labelmap.extend(set([rect['class'] for rect in json.loads(row[1])])) - labelmap = sorted(list(set(labelmap))) - - save_file = config_save_file(label_file, save_file, '.labelmap.tsv') - with open(save_file, 'w') as f: - f.write('\n'.join(labelmap)) - - -def extract_column(tsv_file, col=1, save_file=None): - rows = tsv_reader(tsv_file) - - def gen_rows(): - for i, row in enumerate(rows): - row1 = [row[0], row[col]] - yield row1 - - save_file = config_save_file(tsv_file, save_file, - '.col.{}.tsv'.format(col)) - tsv_writer(gen_rows(), save_file) - - -def remove_column(tsv_file, col=1, save_file=None): - rows = tsv_reader(tsv_file) - - def gen_rows(): - for i, row in enumerate(rows): - del row[col] - yield row - - save_file = config_save_file(tsv_file, save_file, - '.remove.{}.tsv'.format(col)) - tsv_writer(gen_rows(), save_file) - - -def generate_linelist_file(label_file, save_file=None, ignore_attrs=()): - # generate a list of image that has labels - # images with only ignore labels are not selected. 
- line_list = [] - rows = tsv_reader(label_file) - for i, row in tqdm(enumerate(rows)): - labels = json.loads(row[1]) - if labels: - if ignore_attrs and all([any([lab[attr] for attr in ignore_attrs if attr in lab]) \ - for lab in labels]): - continue - line_list.append([i]) - - save_file = config_save_file(label_file, save_file, '.linelist.tsv') - tsv_writer(line_list, save_file) - - -def random_drop_labels(label_file, - drop_ratio, - linelist_file=None, - save_file=None, - drop_image=False): - # randomly drop labels by the ratio - # if drop_image is true, can drop an image by removing all labels - # otherwise will keep at least one label for each image to make sure - # the number of images is equal - rows = tsv_reader(label_file) - line_list = get_line_list(linelist_file) - rows_new = [] - cnt_original = 0 - cnt_new = 0 - for i, row in enumerate(rows): - if line_list and (i not in line_list): - row_new = [row[0], json.dumps([])] - else: - labels = json.loads(row[1]) - if len(labels) == 0: - labels_new = [] - else: - rand = np.random.random(len(labels)) - labels_new = [ - obj for j, obj in enumerate(labels) - if rand[j] >= drop_ratio - ] - if not drop_image and not labels_new: - # make sure there is at least one label if drop image is not allowed - labels_new = [labels[0]] - cnt_original += len(labels) - cnt_new += len(labels_new) - row_new = [row[0], json.dumps(labels_new)] - rows_new.append(row_new) - - save_file = config_save_file(label_file, save_file, - '.drop.{}.tsv'.format(drop_ratio)) - tsv_writer(rows_new, save_file) - print("original labels = {}".format(cnt_original)) - print("new labels = {}".format(cnt_new)) - print("given drop_ratio = {}".format(drop_ratio)) - print("real drop_ratio = {}".format( - float(cnt_original - cnt_new) / cnt_original)) - - -def merge_two_label_files(label_file1, label_file2, save_file=None): - rows1 = tsv_reader(label_file1) - rows2 = tsv_reader(label_file2) - - rows_new = [] - for row1, row2 in zip(rows1, rows2): - assert row1[0] == row2[0] - labels = json.loads(row1[1]) + json.loads(row2[1]) - rows_new.append([row1[0], json.dumps(labels)]) - - save_file = config_save_file(label_file1, save_file, '.merge.tsv') - tsv_writer(rows_new, save_file) - - -def is_same_keys_for_files(tsv_file1, - tsv_file2, - linelist_file1=None, - linelist_file2=None): - # check if two files have the same keys for all rows - tsv1 = TSVFile(tsv_file1) - tsv2 = TSVFile(tsv_file2) - line_list1 = get_line_list(linelist_file1, tsv1.num_rows()) - line_list2 = get_line_list(linelist_file2, tsv2.num_rows()) - assert len(line_list1) == len(line_list2) - for idx1, idx2 in zip(line_list1, line_list2): - row1 = tsv1.seek(idx1) - row2 = tsv2.seek(idx2) - if row1[0] == row2[0]: - continue - else: - print("key mismatch {}-{}".format(row1[0], row2[0])) - return False - return True - - -def sort_file_based_on_keys(ref_file, tsv_file, save_file=None): - # sort tsv_file to have the same key in each row as ref_file - if is_same_keys_for_files(ref_file, tsv_file): - print("file keys are the same, skip sorting") - return tsv_file - - ref_keys = [row[0] for row in tsv_reader(ref_file)] - all_keys = [row[0] for row in tsv_reader(tsv_file)] - indexes = [all_keys.index(key) for key in ref_keys] - tsv = TSVFile(tsv_file) - - def gen_rows(): - for idx in indexes: - yield tsv.seek(idx) - - save_file = config_save_file(tsv_file, save_file, '.sorted.tsv') - tsv_writer(gen_rows(), save_file) - - -def reorder_tsv_keys(in_tsv_file, ordered_keys, out_tsv_file): - tsv = TSVFile(in_tsv_file) - keys = 
[tsv.seek(i)[0] for i in tqdm(range(len(tsv)))] - key_to_idx = {key: i for i, key in enumerate(keys)} - - def gen_rows(): - for key in tqdm(ordered_keys): - idx = key_to_idx[key] - yield tsv.seek(idx) - - tsv_writer(gen_rows(), out_tsv_file) - - -def reorder_tsv_keys_with_file(in_tsv_file, ref_tsv_file, out_tsv_file): - ordered_keys = [row[0] for row in tsv_reader(ref_tsv_file)] - reorder_tsv_keys(in_tsv_file, ordered_keys, out_tsv_file) - - -def convert_caption_json_to_tsv(caption_json_file, key_tsv_file, out_tsv_file): - keys = [row[0] for row in tsv_reader(key_tsv_file)] - rows_dict = {key: [] for key in keys} - - with open(caption_json_file, 'r') as f: - captions = json.load(f) - - for cap in captions: - image_id = cap['image_id'] - del cap['image_id'] - if image_id in rows_dict: - rows_dict[image_id].append(cap) - - rows = [[key, json.dumps(rows_dict[key])] for key in keys] - tsv_writer(rows, out_tsv_file) - - -def generate_caption_linelist_file(caption_tsv_file, save_file=None): - num_captions = [] - for row in tsv_reader(caption_tsv_file): - num_captions.append(len(json.loads(row[1]))) - - cap_linelist = [ - '\t'.join([str(img_idx), str(cap_idx)]) - for img_idx in range(len(num_captions)) - for cap_idx in range(num_captions[img_idx]) - ] - save_file = config_save_file(caption_tsv_file, save_file, '.linelist.tsv') - with open(save_file, 'w') as f: - f.write('\n'.join(cap_linelist)) - - -def convert_feature_format(in_tsv, out_tsv, fea_dim=None): - # convert the old feature file format to new one - # set fea_dim to remove spatial features if necessary. - def gen_rows(): - for row in tqdm(tsv_reader(in_tsv)): - key = row[0] - feat_info = json.loads(row[1]) - num_boxes = feat_info['num_boxes'] - features = np.frombuffer(base64.b64decode(feat_info['features']), - np.float32).reshape(num_boxes, -1) - if fea_dim: - feat_info_new = [{ - 'feature': - base64.b64encode(features[i][:fea_dim]).decode('utf-8') - } for i in range(num_boxes)] - else: - feat_info_new = [{ - 'feature': - base64.b64encode(features[i]).decode('utf-8') - } for i in range(num_boxes)] - yield [key, json.dumps(feat_info_new)] - - tsv_writer(gen_rows(), out_tsv) - - -def convert_feature_format2(in_tsv, out_tsv, fea_dim=None): - # new format from Pengchuan - def gen_rows(): - for row in tqdm(tsv_reader(in_tsv)): - key = row[0] - num_boxes = int(row[1]) - features = np.frombuffer(base64.b64decode(row[2]), - np.float32).reshape(num_boxes, -1) - if fea_dim: - feat_info = [{ - 'feature': - base64.b64encode(features[i][:fea_dim]).decode('utf-8') - } for i in range(num_boxes)] - else: - feat_info = [{ - 'feature': - base64.b64encode(features[i]).decode('utf-8') - } for i in range(num_boxes)] - yield [key, json.dumps(feat_info)] - - tsv_writer(gen_rows(), out_tsv) - - -def merge_label_fields(in_tsv1, in_tsv2, out_tsv): - # merge the label fields for each box - def gen_rows(): - for row1, row2 in tqdm(zip(tsv_reader(in_tsv1), tsv_reader(in_tsv2))): - assert row1[0] == row2[0] - label_info1 = json.loads(row1[1]) - label_info2 = json.loads(row2[1]) - assert len(label_info1) == len(label_info2) - for lab1, lab2 in zip(label_info1, label_info2): - lab1.update(lab2) - yield [row1[0], json.dumps(label_info1)] - - tsv_writer(gen_rows(), out_tsv) - - -def remove_label_fields(in_tsv, out_tsv, remove_fields): - if type(remove_fields) == str: - remove_fields = [remove_fields] - assert type(remove_fields) == list - - def gen_rows(): - for row in tqdm(tsv_reader(in_tsv)): - label_info = json.loads(row[1]) - for lab in label_info: - for field in 
remove_fields: - if field in lab: - del lab[field] - yield [row[0], json.dumps(label_info)] - - tsv_writer(gen_rows(), out_tsv) - - -def random_permute_label_file(in_tsv, out_tsv): - # take a label file as input and randomly match image - # with the label from a different image - tsv = TSVFile(in_tsv) - random_index = np.random.permutation(tsv.num_rows()) - - def gen_rows(): - for idx, rand_idx in enumerate(random_index): - key = tsv.seek(idx)[0] - labels = tsv.seek(rand_idx)[1] - yield [key, labels] - - tsv_writer(gen_rows(), out_tsv) - # save the random index for reference - save_file = op.splitext(out_tsv)[0] + '.random_index.tsv' - with open(save_file, 'w') as f: - f.write('\n'.join([str(idx) for idx in random_index])) - - -def create_mini_yaml_with_linelist(in_yaml, num_files): - # create linelist files to split a yaml into multiple ones - # useful for inference on large-scale dataset - data_cfg = load_from_yaml_file(in_yaml) - data_dir = op.dirname(in_yaml) - split_name = op.basename(in_yaml).split('.')[0] - hw_file = op.join(data_dir, data_cfg['hw']) - num_rows = TSVFile(hw_file).num_rows() - rows_per_file = math.ceil(num_rows / num_files) - for idx in range(num_files): - start_idx = idx * rows_per_file - end_idx = min(start_idx + rows_per_file, num_rows) - linelist = [str(i) for i in range(start_idx, end_idx)] - linelist_file = op.join(data_dir, - split_name + '.linelist_{}.tsv'.format(idx)) - print("create linelist file: " + linelist_file) - with open(linelist_file, 'w') as f: - f.write('\n'.join(linelist)) - data_cfg['linelist'] = op.basename(linelist_file) - out_yaml = op.splitext(in_yaml)[0] + '_{}.yaml'.format(idx) - write_to_yaml_file(data_cfg, out_yaml) - print("create yaml file: " + out_yaml) - - -def mapping_labels(in_tsv, out_tsv, label_mapping_dict): - - def gen_rows(): - for row in tsv_reader(in_tsv): - label_info = json.loads(row[1]) - for lab in label_info: - if lab['class'] in label_mapping_dict: - lab['class'] = label_mapping_dict[lab['class']] - yield [row[0], json.dumps(label_info)] - - tsv_writer(gen_rows(), out_tsv) - - -def select_rows_in_linelist(in_tsv, out_tsv, linelist_file): - tsv = TSVFile(in_tsv) - line_list = load_linelist_file(linelist_file) - - def gen_rows(): - for idx in line_list: - yield tsv.seek(idx) - - tsv_writer(gen_rows(), out_tsv) - - -def generate_full_region_label_file(hw_tsv, out_tsv, class_name=None): - # given a height/width file, generate a label file - def gen_rows(): - for row in tsv_reader(hw_tsv): - try: - data = json.loads(row[1]) - assert type(data) in (list, dict) - if type(data) == list: - height, width = data[0]['height'], data[0]['width'] - else: - height, width = data['height'], data['width'] - except ValueError: - hw_str = row[1].split(' ') - height, width = int(hw_str[0]), int(hw_str[1]) - label = {'rect': [0, 0, width, height]} - if class_name: - label.update({'class': class_name}) - yield [row[0], json.dumps([label])] - - tsv_writer(gen_rows(), out_tsv) diff --git a/AVLFormer/src/utils/tsv_io.py b/AVLFormer/src/utils/tsv_io.py deleted file mode 100644 index 7e73eb3..0000000 --- a/AVLFormer/src/utils/tsv_io.py +++ /dev/null @@ -1,1689 +0,0 @@ -import glob -import json -import logging -import mmap -import os -import os.path as op -import random -import re -import shutil -import time - -from .qd_common import ( - copy_file, - ensure_directory, - exclusive_open_to_read, - get_file_size, - get_user_name, - hash_sha1, -) - -try: - from itertools import izip as zip -except ImportError: - # python 3 - pass -import progressbar - 
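For reference, the two index layouts consumed below (and produced by tsv_writer earlier in this diff) can be read with a short, self-contained sketch; the helper names are illustrative only and assume exactly the layout shown above:

    def read_offsets_ascii(lineidx_path):
        # .lineidx: one ASCII decimal byte-offset per row
        with open(lineidx_path) as fp:
            return [int(line.strip()) for line in fp]

    def read_offset_8b(lineidx_8b_path, row):
        # .lineidx.8b: fixed-width 8-byte little-endian offsets, so the
        # offset of row i sits at byte position 8 * i
        with open(lineidx_8b_path, 'rb') as fp:
            fp.seek(8 * row)
            return int.from_bytes(fp.read(8), 'little')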
-from .qd_common import qd_tqdm as tqdm - -logger = logging.getLogger(__name__) - - -def get_default_splits(): - return ['train', 'trainval', 'test', 'val'] - - -def get_tsv_lineidx(tsv_file): - #assert tsv_file.endswith('.tsv') or tsv_file.endswith('.txt') - return tsv_file[:-3] + 'lineidx' - - -def get_tsv_lineidx_8b(tsv_file): - return tsv_file[:-3] + 'lineidx.8b' - - -def get_tsv_associates(tsv_file): - return [get_tsv_lineidx(tsv_file), get_tsv_lineidx_8b(tsv_file)] - - -def rm_tsv(tsv_file): - if op.isfile(tsv_file): - os.remove(tsv_file) - for line_idx in [ - get_tsv_lineidx(tsv_file), - get_tsv_lineidx_8b(tsv_file) - ]: - if op.isfile(line_idx): - os.remove(line_idx) - - -def tsv_rm(tsv_file): - rm_tsv(tsv_file) - - -def tsv_copy(src_tsv, dst_tsv): - copy_file(src_tsv, dst_tsv) - for s, t in zip(get_tsv_associates(src_tsv), get_tsv_associates(dst_tsv)): - if op.isfile(s): - copy_file(s, t) - - -def tsv_mv(src_file, dst_file): - shutil.move(src_file, dst_file) - for s, t in zip(get_tsv_associates(src_file), - get_tsv_associates(dst_file)): - if op.isfile(s): - shutil.move(s, t) - - -def reorder_tsv_keys(in_tsv_file, ordered_keys, out_tsv_file): - tsv = TSVFile(in_tsv_file) - logger.info('loading keys in input') - keys = [ - tsv.seek_first_column(i) for i in tqdm(range(len(tsv)), mininterval=2) - ] - key_to_idx = {key: i for i, key in enumerate(keys)} - - def gen_rows(): - logger.info('writing') - for key in tqdm(ordered_keys, mininterval=2): - idx = key_to_idx[key] - yield tsv.seek(idx) - - tsv_writer(gen_rows(), out_tsv_file) - - -def read_to_character(fp, c): - result = [] - while True: - s = fp.read(32) - assert s != '' - if c in s: - result.append(s[:s.index(c)]) - break - else: - result.append(s) - return ''.join(result) - - -class CompositeTSVFile(object): - - def __init__(self, - list_file, - seq_file, - cache_policy=None, - hold_buffer=0, - root='.'): - # list_file can be a loaded or constructed pair of index, rather than a - # filename to load. In this case, seq_file will be a list of dataset, - # which should implement len() and __getitem__() so that we can - # reference it. - self.root = root - self.seq_file = seq_file - self.list_file = list_file - self.cache_policy = cache_policy - self.seq = None - self.tsvs = [] - - # please do ont call ensure_initialized here. we wil always do it - # lazily. we may load a huge amount of seq, which could be super slow - # when spawning multiple processes. - - # this means, how many tsv fp pointer we will hold. If it is 0 or less - # than 0, we will hold all fp pointers we need. If it is larger than 0, - # we only hold some, which are kept in self.hold_sources - self.hold_buffer = hold_buffer - self.hold_sources = [] - - def __repr__(self): - return 'CompositeTSVFile(list_file={}, seq_file={})'.format( - self.seq_file, self.list_file) - - def get_row_len(self, i): - self.ensure_initialized() - idx_source, idx_row, _ = map(int, self.seq[i]) - result = self.tsvs[idx_source].get_row_len(idx_row) - return result - - def __getitem__(self, index): - self.ensure_initialized() - idx_source, idx_row, _ = map(int, self.seq[index]) - start = time.time() - result = self.tsvs[idx_source].seek(idx_row) - end = time.time() - if end - start > 10: - logging.warning( - 'too long to load fname = {}, source={}, row={}, time={}'. 
- format(self.tsvs[idx_source], idx_source, idx_row, - end - start)) - if self.hold_buffer > 0 and idx_source not in self.hold_sources: - if len(self.hold_sources) >= self.hold_buffer: - close_idx_source = self.hold_sources.pop(0) - self.tsvs[close_idx_source].close_fp() - self.hold_sources.append(idx_source) - return result - - def __len__(self): - self.ensure_initialized() - return len(self.seq) - - def __iter__(self): - self.ensure_initialized() - self.next_row = 0 - self.seq = iter(self.seq) - return self - - def __next__(self): - # this function is not well tested. let's have a breakpoint here - if self.next_row >= len(self): - raise StopIteration - idx_source, idx_row = map(int, next(self.seq)) - return self.tsvs[idx_source][idx_row] - - def release(self): - # this is to ensure we released all the resources - self.seq = None - for t in self.tsvs: - t.close() - - def close(self): - self.release() - - def seek_first_column(self, index): - self.ensure_initialized() - idx_source, idx_row, _ = map(int, self.seq[index]) - return self.tsvs[idx_source].seek_first_column(idx_row) - - def get_composite_source_idx(self): - return [int(i) for i, _ in self.seq] - - def ensure_initialized(self): - if self.seq is None: - if isinstance(self.list_file, str) and \ - isinstance(self.seq_file, str): - self.seq = TSVFile(self.seq_file) - self.file_list = load_list_file(self.list_file) - self.tsvs = [ - TSVFile(op.join(self.root, f), self.cache_policy) - for f in self.file_list - ] - elif isinstance(self.list_file, list) and \ - isinstance(self.seq_file, str): - self.seq = TSVFile(self.seq_file) - self.file_list = self.list_file - self.tsvs = [ - TSVFile(op.join(self.root, f), self.cache_policy) - for f in self.file_list - ] - else: - # Not sure when to use this one - self.seq = self.list_file - self.tsvs = self.seq_file - - def num_rows(self): - return self.__len__() - - def get_key(self, index): - idx_source, idx_row, _ = map(int, self.seq[index]) - k = self.tsvs[idx_source].get_key(idx_row) - return '_'.join([self.file_list[idx_source], k]) - - -class TSVFile(object): - - def __init__(self, tsv_file, cache_policy=None): - self.tsv_file = tsv_file - self.lineidx = op.splitext(tsv_file)[0] + '.lineidx' - self.lineidx_8b = self.lineidx + '.8b' - self._fp = None - self._mfp = None - self._lineidx = None - self.fp8b = None - self.cache_policy = cache_policy - self.close_fp_after_read = False - if os.environ.get('QD_TSV_CLOSE_FP_AFTER_READ'): - self.close_fp_after_read = bool( - os.environ['QD_TSV_CLOSE_FP_AFTER_READ']) - self.use_mmap = False - if os.environ.get('QD_TSV_MMAP'): - self.use_mmap = int(os.environ['QD_TSV_MMAP']) - self.use_fuse = False - # if os.environ.get('QD_TSV_USE_FUSE'): - # self.use_fuse = int(os.environ['QD_TSV_USE_FUSE']) - # from .cloud_storage import create_cloud_fuse - # self.fuser = create_cloud_fuse() - self.has_lineidx_8b = int(os.environ.get('QD_USE_LINEIDX_8B', '0')) - # the process always keeps the process which opens the - # file. If the pid is not equal to the currrent pid, we will re-open - # teh file. 
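        # keeping the opening pid makes reads fork-safe: forked workers (for
        # example PyTorch DataLoader processes) inherit the parent's file
        # object, and sharing one seek position across processes would
        # otherwise interleave and corrupt the reads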
- self.pid = None - self.lineidx_8b_pid = None - self.open_once = False - - self._cache() - self._len = None - - def get_row_len(self, i): - start = self.get_offset(i) - if i < len(self) - 1: - end = self.get_offset(i + 1) - else: - end = QDFile.get_file_size(self.tsv_file) - return end - start - - def close_fp(self): - if self._fp: - self._fp.close() - self._fp = None - if self._mfp: - self._mfp.close() - self._mfp = None - if self.has_lineidx_8b and self.fp8b: - self.fp8b.close() - self.fp8b = None - - def release(self): - self.close_fp() - self._lineidx = None - - def close(self): - #@deprecated('use release to make it more clear not to release lineidx') - self.close_fp() - - def __del__(self): - self.release() - - def __str__(self): - return "TSVFile(tsv_file='{}')".format(self.tsv_file) - - def __repr__(self): - return str(self) - - def __iter__(self): - self._ensure_tsv_opened() - self.fp_seek(0) - self.next_row = 0 - return self - - def __next__(self): - if self.next_row >= len(self): - raise StopIteration - self.next_row += 1 - result = self.get_current_column() - return result - - def num_rows(self): - if self._len is None: - if self.has_lineidx_8b: - self._len = QDFile.get_file_size(self.lineidx_8b) // 8 - else: - self._ensure_lineidx_loaded() - self._len = len(self._lineidx) - return self._len - - def get_key(self, idx): - return self.seek_first_column(idx) - - def get_current_column(self): - if self.use_mmap: - result = [ - s.strip() for s in self._mfp.readline().decode().split('\t') - ] - else: - result = [s.strip() for s in self._fp.readline().split('\t')] - return result - - def fp_seek(self, pos): - if self.use_mmap: - self._mfp.seek(pos) - else: - self._fp.seek(pos) - - def seek(self, idx): - self._ensure_tsv_opened() - pos = self.get_offset(idx) - self.fp_seek(pos) - result = self.get_current_column() - if self.close_fp_after_read: - self.close_fp() - return result - - def seek_first_column(self, idx): - self._ensure_tsv_opened() - pos = self.get_offset(idx) - self._fp.seek(pos) - return read_to_character(self._fp, '\t') - - def open(self, fname, mode): - if self.use_fuse: - return self.fuser.open(fname, mode) - else: - return exclusive_open_to_read(fname, mode) - - def ensure_lineidx_8b_opened(self): - if self.fp8b is None: - self.fp8b = self.open(self.lineidx_8b, 'rb') - self.lineidx_8b_pid = os.getpid() - if self.lineidx_8b_pid != os.getpid(): - self.fp8b.close() - logger.info('re-open {} because the process id changed'.format( - self.lineidx_8b)) - self.fp8b = self.open(self.lineidx_8b, 'rb') - self.lineidx_8b_pid = os.getpid() - - def get_offset(self, idx): - # do not use op.isfile() to check whether lineidx_8b exists as it may - # incur API call for blobfuse, which will be super slow if we enumerate - # a bunch of data - if self.has_lineidx_8b: - self.ensure_lineidx_8b_opened() - self.fp8b.seek(idx * 8) - return int.from_bytes(self.fp8b.read(8), 'little') - else: - self._ensure_lineidx_loaded() - pos = self._lineidx[idx] - return pos - - def __getitem__(self, index): - return self.seek(index) - - def __len__(self): - return self.num_rows() - - def _ensure_lineidx_loaded(self): - if self._lineidx is None: - # please do not check if it is expired. 
Reason: if we copy the data from somewhere else, the timestamp might not be kept
-            # if not op.isfile(self.lineidx) and not op.islink(self.lineidx):
-            #     generate_lineidx(self.tsv_file, self.lineidx)
-            # with open(self.lineidx, 'r') as fp:
-            # with limited_retry_agent(10, open, self.lineidx, 'r') as fp:
-            if self.use_fuse:
-                with self.fuser.open(self.lineidx, 'r') as fp:
-                    self._lineidx = tuple(
-                        [int(i.strip()) for i in fp.readlines()])
-            else:
-                with exclusive_open_to_read(self.lineidx) as fp:
-                    self._lineidx = tuple(
-                        [int(i.strip()) for i in fp.readlines()])
-            logger.info('loaded {} from {}'.format(len(self._lineidx),
-                                                   self.lineidx))
-
-    def _cache(self):
-        if self.cache_policy == 'memory':
-            # make sure the tsv is opened here. don't put it in seek. If we
-            # put it in the first call of seek(), it loads all the content
-            # there. With multiple workers in pytorch, each worker would have
-            # to read all the files and cache them to memory. If we load it
-            # here in the main thread, it won't be copied to each worker.
-            logger.info('caching {} to memory'.format(self.tsv_file))
-            from io import StringIO
-            result = StringIO()
-            total = op.getsize(self.tsv_file)
-            import psutil
-            avail = psutil.virtual_memory().available
-            if avail < total:
-                logger.info(
-                    'not enough memory to cache {} < {}. fall back'.format(
-                        avail, total))
-            else:
-                pbar = tqdm(total=total / 1024. / 1024.)
-                with open(self.tsv_file, 'r') as fp:
-                    while True:
-                        x = fp.read(1024 * 1024 * 100)
-                        if len(x) == 0:
-                            break
-                        pbar.update(len(x) / 1024. / 1024.)
-                        result.write(x)
-                self._fp = result
-
-        elif self.cache_policy == 'tmp':
-            tmp_tsvfile = op.join('/tmp', self.tsv_file)
-            tmp_lineidx = op.join('/tmp', self.lineidx)
-            ensure_directory(op.dirname(tmp_tsvfile))
-
-            from .qd_common import ensure_copy_file
-            ensure_copy_file(self.tsv_file, tmp_tsvfile)
-            ensure_copy_file(self.lineidx, tmp_lineidx)
-
-            self.tsv_file = tmp_tsvfile
-            self.lineidx = tmp_lineidx
-            # do not run the following. Supposedly, this function is called in
-            # the init function. If we use multiprocess, the file handler will
-            # be duplicated and thus the seek will have some race condition if
-            # we have the following:
-            # self._fp = open(self.tsv_file, 'r')
-        elif self.cache_policy is not None:
-            raise ValueError('unknown cache policy {}'.format(
-                self.cache_policy))
-
-    def get_tsv_fp(self):
-        start = time.time()
-        if self.use_fuse:
-            fp = self.fuser.open(self.tsv_file, 'r')
-        else:
-            if not self.open_once:
-                fp = exclusive_open_to_read(self.tsv_file)
-                self.open_once = True
-            else:
-                fp = open(self.tsv_file)
-        if self.use_mmap:
-            mfp = mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ)
-        else:
-            mfp = fp
-        end = time.time()
-        if (end - start) > 10:
-            logger.info('too long ({}) to open {}'.format(
-                end - start, self.tsv_file))
-        return mfp, fp
-
-    def _ensure_tsv_opened(self):
-        if self.cache_policy == 'memory':
-            assert self._fp is not None
-            return
-
-        if self._fp is None:
-            self._mfp, self._fp = self.get_tsv_fp()
-            self.pid = os.getpid()
-
-        if self.pid != os.getpid():
-            self._mfp.close()
-            self._fp.close()
-            logger.info('re-open {} because the process id changed'.format(
-                self.tsv_file))
-            from .qd_common import print_opened_files
-            print_opened_files()
-            self._mfp, self._fp = self.get_tsv_fp()
-            self.pid = os.getpid()
-
-
-def get_all_associate_files(all_info):
-    result = []
-    for info in all_info:
-        result.extend(get_associate_files(info))
-    return result
-
-
-def get_associate_files(info):
-    data, split, t, version = info
-    dataset = TSVDataset(data)
-    fname = dataset.get_data(split, t, version)
-    result = []
-    if op.isfile(fname):
-        result.append(fname)
-    else:
-        result.extend(load_list_file(dataset.get_data(split + 'X', t,
-                                                      version)))
-        result.append(dataset.get_shuffle_file(split))
-    extra = [get_tsv_lineidx_8b(r) for r in result]
-    result.extend(extra)
-    return result
-
-
-class TSVDataset(object):
-
-    def __init__(self, name, data_root=None):
-        self.name = name
-        if data_root is None:
-            if os.environ.get('QD_DATA_ROOT') is not None:
-                data_root = os.environ['QD_DATA_ROOT']
-            else:
-                proj_root = op.dirname(
-                    op.dirname(op.dirname(op.realpath(__file__))))
-                data_root = op.join(proj_root, 'data')
-        data_root = op.join(data_root, name)
-        self._data_root = op.relpath(data_root)
-        self._fname_to_tsv = {}
-
-        self._split_to_key_to_idx = {}
-
-    def __repr__(self):
-        return 'TSVDataset({})'.format(self.name)
-
-    def __str__(self):
-        return 'TSVDataset({})'.format(self.name)
-
-    def seek_by_key(self, key, split, t=None, version=None):
-        idx = self.get_idx_by_key(key, split)
-        return next(self.iter_data(split, t, version, filter_idx=[idx]))
-
-    def seek_by_idx(self, idx, split, t=None, version=None):
-        return next(self.iter_data(split, t, version, filter_idx=[idx]))
-
-    def load_labelmap(self):
-        return load_list_file(self.get_labelmap_file())
-
-    def load_pos_labelmap(self):
-        return load_list_file(self.get_pos_labelmap_file())
-
-    def get_tree_file(self):
-        return op.join(self._data_root, 'tree.txt')
-
-    def get_labelmap_file(self):
-        return op.join(self._data_root, 'labelmap.txt')
-
-    def load_txt(self, t='labelmap'):
-        return load_list_file(self.get_txt(t))
-
-    # labelmap or attribute map
-    def get_txt(self, t='labelmap'):
-        return op.join(self._data_root, '{}.txt'.format(t))
-
-    def get_pos_labelmap_file(self):
-        return op.join(self._data_root, 'labelmap.pos.txt')
-
-    def get_train_shuffle_file(self):
-        return self.get_shuffle_file('train')
-
-    def get_shuffle_file(self, split_name):
-        return op.join(self._data_root, '{}.shuffle.txt'.format(split_name))
-
-    def get_labelmap_of_noffset_file(self):
-        return op.join(self._data_root, 'noffsets.label.txt')
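To make the path helpers above concrete, here is a sketch of the on-disk layout this class resolves against, using a hypothetical dataset name `coco` (the root comes from `$QD_DATA_ROOT` or `<project>/data`); the version suffixes follow `get_data` defined later in this file, and the example assumes the module is importable:

```python
# data/coco/train.tsv           image split (no version suffix)
# data/coco/train.label.tsv     label split, version 0
# data/coco/train.label.v3.tsv  label split, version 3
# data/coco/trainX.tsv          composite split: list of source tsv paths
# data/coco/train.shuffle.txt   (source_idx, row_idx) pairs for composites
# data/coco/labelmap.txt        one class name per line
d = TSVDataset('coco')
assert d.get_shuffle_file('train').endswith('train.shuffle.txt')
assert d.get_labelmap_file().endswith('labelmap.txt')
```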
-    def get_idx_by_key(self, key, split):
-        if split in self._split_to_key_to_idx:
-            key_to_idx = self._split_to_key_to_idx[split]
-        else:
-            key_to_idx = {k: i for i, k in enumerate(self.load_keys(split))}
-            self._split_to_key_to_idx[split] = key_to_idx
-        idx = key_to_idx[key]
-        return idx
-
-    def load_key_to_idx(self, split):
-        result = {}
-        for i, row in enumerate(self.iter_data(split, 'label')):
-            key = row[0]
-            assert key not in result
-            result[key] = i
-        return result
-
-    def load_keys(self, split, t='label'):
-        assert self.has(split, t)
-        result = []
-        for row in tqdm(self.iter_data(split, t), mininterval=2):
-            result.append(row[0])
-        return result
-
-    def dynamic_update(self, dataset_ops):
-        '''
-        sometimes we update the dataset; here we update the file path
-        accordingly
-        '''
-        if len(dataset_ops) >= 1 and dataset_ops[0]['op'] == 'sample':
-            self._data_root = op.join(
-                './output/data/',
-                '{}_{}_{}'.format(self.name, dataset_ops[0]['sample_label'],
-                                  dataset_ops[0]['sample_image']))
-        elif len(dataset_ops
-                 ) >= 1 and dataset_ops[0]['op'] == 'mask_background':
-            target_folder = op.join(
-                './output/data', '{}_{}_{}'.format(
-                    self.name,
-                    '.'.join(map(str, dataset_ops[0]['old_label_idx'])),
-                    dataset_ops[0]['new_label_idx']))
-            self._data_root = target_folder
-
-    def get_test_tsv_file(self, t=None):
-        return self.get_data('test', t)
-
-    def get_test_tsv_lineidx_file(self):
-        return op.join(self._data_root, 'test.lineidx')
-
-    def get_train_tsvs(self, t=None):
-        if op.isfile(self.get_data('train', t)):
-            return [self.get_data('train', t)]
-        trainx_file = op.join(self._data_root, 'trainX.tsv')
-        if not op.isfile(trainx_file):
-            return []
-        train_x = load_list_file(trainx_file)
-        if t is None:
-            return train_x
-        elif t == 'label':
-            if op.isfile(self.get_data('trainX', 'label')):
-                return load_list_file(self.get_data('trainX', 'label'))
-            else:
-                files = [op.splitext(f)[0] + '.label.tsv' for f in train_x]
-                return files
-
-    def get_train_tsv(self, t=None):
-        return self.get_data('train', t)
-
-    def get_lineidx(self, split_name):
-        return op.join(self._data_root, '{}.lineidx'.format(split_name))
-
-    def get_latest_version(self, split, t=None):
-        assert t is not None, 'if t is None, the version is always 0'
-        v = 0
-        if t is None:
-            pattern = op.join(self._data_root, '{}.v*.tsv'.format(split))
-            re_pattern = r'{}\.v([0-9]*)\.tsv'.format(split)
-        else:
-            pattern = op.join(self._data_root,
-                              '{}.{}.v*.tsv'.format(split, t))
-            re_pattern = r'{}\.{}\.v([0-9]*)\.tsv'.format(split, t)
-        all_file = glob.glob(pattern)
-        import re
-        re_results = [re.match(re_pattern, op.basename(f)) for f in all_file]
-        candidates = ([
-            int(re_result.groups()[0])
-            for re_result, f in zip(re_results, all_file) if re_result
-        ])
-        if len(candidates) > 0:
-            v = max(candidates)
-        assert v >= 0
-        return v
-
-    def get_gen_info_data(self, split, t=None, version=None):
-        return self.get_data(split,
-                             '{}.generate.info'.format(t),
-                             version=version)
-
-    def get_file(self, fname):
-        return op.join(self._data_root, fname)
-
-    def get_data(self, split_name, t=None, version=None):
-        '''
-        e.g.
split_name = train, t = label - if version = None or 0, return train.label.tsv - we don't have train.label.v0.tsv - if version = 3 > 0, return train.label.v3.tsv - if version = -1, return the highest version - ''' - if t is None: - # in this case, it is an image split, which has no version - version = None - if version is None or version in [0, 'None', '0']: - if t is None: - return op.join(self._data_root, '{}.tsv'.format(split_name)) - else: - return op.join(self._data_root, - '{}.{}.tsv'.format(split_name, t)) - elif version == -1: - if not op.isfile(self.get_data(split_name, t)): - return self.get_data(split_name, t) - v = self.get_latest_version(split_name, t) - return self.get_data(split_name, t, v) - else: - return op.join(self._data_root, - '{}.{}.v{}.tsv'.format(split_name, t, version)) - - def get_num_train_image(self): - if op.isfile(self.get_data('trainX')): - if op.isfile(self.get_shuffle_file('train')): - return len(load_list_file(self.get_shuffle_file('train'))) - else: - return 0 - else: - return len( - load_list_file(op.join(self._data_root, 'train.lineidx'))) - - def get_trainval_tsv(self, t=None): - return self.get_data('trainval', t) - - def get_noffsets_file(self): - return op.join(self._data_root, 'noffsets.txt') - - def load_noffsets(self): - logger.info('deprecated: pls generate it on the fly') - return load_list_file(self.get_noffsets_file()) - - def load_inverted_label(self, split, version=None, label=None): - fname = self.get_data(split, 'inverted.label', version) - if not op.isfile(fname): - return {} - elif label is None: - tsv = TSVFile(fname) - num_rows = len(tsv) - result = {} - for row in tqdm(tsv, total=num_rows, mininterval=2): - assert row[0] not in result - assert len(row) == 2 - ss = row[1].split(' ') - if len(ss) == 1 and ss[0] == '': - result[row[0]] = [] - else: - result[row[0]] = list(map(int, ss)) - return result - else: - all_label = load_list_file( - self.get_data(split, 'labelmap', version)) - if label not in all_label: - return {} - result = {} - idx = all_label.index(label) - tsv = self._retrieve_tsv(fname) - row = tsv.seek(idx) - assert row[0] == label - ss = row[1].split(' ') - if len(ss) == 1 and ss[0] == '': - result[row[0]] = [] - else: - result[row[0]] = list(map(int, ss)) - return result - - def load_inverted_label_as_list(self, split, version=None, label=None): - fname = self.get_data(split, 'inverted.label', version) - if not op.isfile(fname): - return [] - elif label is None: - rows = tsv_reader(fname) - result = [] - for row in rows: - assert len(row) == 2 - ss = row[1].split(' ') - if len(ss) == 1 and ss[0] == '': - result.append((row[0], [])) - else: - result.append((row[0], list(map(int, ss)))) - return result - else: - all_label = self.load_labelmap() - result = [] - idx = all_label.index(label) - tsv = self._retrieve_tsv(fname) - row = tsv.seek(idx) - assert row[0] == label - ss = row[1].split(' ') - if len(ss) == 1 and ss[0] == '': - result.append((row[0], [])) - else: - result.append((row[0], list(map(int, ss)))) - return result - - def has(self, split, t=None, version=None): - return op.isfile(self.get_data(split, t, version)) or ( - op.isfile(self.get_data('{}X'.format(split), t, version)) - and op.isfile(self.get_shuffle_file(split))) - - def last_update_time(self, split, t=None, version=None): - tsv_file = self.get_data(split, t, version) - if op.isfile(tsv_file): - return os.path.getmtime(tsv_file) - assert version is None or version == 0, 'composite dataset always v=0' - tsv_file = self.get_data('{}X'.format(split), t, 
                                                 version)
-        assert op.isfile(tsv_file)
-        return os.path.getmtime(tsv_file)
-
-    def iter_composite(self, split, t, version, filter_idx=None):
-        splitX = split + 'X'
-        file_list = load_list_file(self.get_data(splitX, t, version))
-        tsvs = [self._retrieve_tsv(f) for f in file_list]
-        shuffle_file = self.get_shuffle_file(split)
-        if filter_idx is None:
-            shuffle_tsv_rows = tsv_reader(shuffle_file)
-            for idx_source, idx_row in shuffle_tsv_rows:
-                idx_source, idx_row = int(idx_source), int(idx_row)
-                row = tsvs[idx_source].seek(idx_row)
-                if len(row) == 3:
-                    row[1] = 'dont use'
-                yield row
-        else:
-            shuffle_tsv = self._retrieve_tsv(shuffle_file)
-            for i in filter_idx:
-                idx_source, idx_row = shuffle_tsv.seek(i)
-                idx_source, idx_row = int(idx_source), int(idx_row)
-                row = tsvs[idx_source].seek(idx_row)
-                if len(row) == 3:
-                    row[1] = 'dont use'
-                yield row
-
-    def num_rows(self, split, t=None, version=None):
-        f = self.get_data(split, t, version)
-        if op.isfile(f) or op.islink(f):
-            return TSVFile(f).num_rows()
-        else:
-            f = self.get_data(split + 'X', version=version)
-            assert op.isfile(f), f
-            return len(load_list_file(self.get_shuffle_file(split)))
-
-    def iter_data(self,
-                  split,
-                  t=None,
-                  version=None,
-                  unique=False,
-                  filter_idx=None,
-                  progress=False):
-        if progress:
-            if filter_idx is None:
-                num_rows = self.num_rows(split)
-            else:
-                num_rows = len(filter_idx)
-            pbar = progressbar.ProgressBar(maxval=num_rows).start()
-        splitX = split + 'X'
-        if not op.isfile(self.get_data(split, t, version)) and \
-                op.isfile(self.get_data(splitX, t, version)):
-            if t is not None:
-                if unique:
-                    returned = set()
-                for i, row in enumerate(
-                        self.iter_composite(split,
-                                            t,
-                                            version,
-                                            filter_idx=filter_idx)):
-                    if unique and row[0] in returned:
-                        continue
-                    else:
-                        yield row
-                        if unique:
-                            returned.add(row[0])
-                    if progress:
-                        pbar.update(i)
-            else:
-                rows_data = self.iter_composite(split,
-                                                None,
-                                                version=version,
-                                                filter_idx=filter_idx)
-                # logger.info('breaking change: label is ignored for t=None')
-                # rows_label = self.iter_data(split, 'label', version=version,
-                #                             filter_idx=filter_idx)
-                if unique:
-                    returned = set()
-                for i, r in enumerate(rows_data):
-                    if unique and r[0] in returned:
-                        continue
-                    else:
-                        yield r
-                        if unique:
-                            returned.add(r[0])
-                    if progress:
-                        pbar.update(i)
-        else:
-            fname = self.get_data(split, t, version)
-            if not op.isfile(fname):
-                logger.info('no {}'.format(fname))
-                return
-            if filter_idx is None:
-                for i, row in enumerate(
-                        tsv_reader(self.get_data(split, t, version))):
-                    yield row
-                    if progress:
-                        pbar.update(i)
-            else:
-                fname = self.get_data(split, t, version)
-                tsv = self._retrieve_tsv(fname)
-                if progress:
-                    for i in tqdm(filter_idx):
-                        yield tsv.seek(i)
-                else:
-                    for i in filter_idx:
-                        yield tsv.seek(i)
-
-    def _retrieve_tsv(self, fname):
-        if fname in self._fname_to_tsv:
-            tsv = self._fname_to_tsv[fname]
-        else:
-            tsv = TSVFile(fname)
-            self._fname_to_tsv[fname] = tsv
-        return tsv
-
-    def safe_write_data(self,
-                        rows,
-                        split,
-                        t=None,
-                        version=None,
-                        generate_info=None,
-                        force=False):
-        assert force or not self.has(split, t, version)
-        if generate_info is None:
-            from .qd_common import get_frame_info
-            info = get_frame_info(last=1)
-
-            def gen_info():
-                for k, v in info.items():
-                    if isinstance(v, str):
-                        yield k, v
-
-            generate_info = gen_info()
-        self.write_data(rows, split, t, version, generate_info=generate_info)
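For concreteness, a hedged usage sketch of the versioned write path that `write_data` below implements; the dataset name and the row content are made up, and file names resolve through `get_data` above:

```python
rows = [('img_0', '[{"class": "cat", "rect": [10, 20, 110, 220]}]')]
dataset = TSVDataset('coco')
# resolves to data/coco/train.label.v1.tsv and writes it atomically,
# together with the .lineidx / .lineidx.8b index files
dataset.write_data(rows, 'train', 'label', version=1)
```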
-    def write_data(self, rows, split, t=None, version=None,
-                   generate_info=None):
-        out_tsv = self.get_data(split, t, version)
-        tsv_writer(rows, out_tsv)
-        if generate_info is not None:
-            out_tsv = self.get_data(split,
-                                    '{}.generate.info'.format(t),
-                                    version=version)
-            tsv_writer(generate_info, out_tsv)
-
-    def update_data(self, rows, split, t, generate_info=None):
-        '''
-        if the data are the same, we will not do anything.
-        '''
-        assert t is not None
-        v = self.get_latest_version(split, t)
-        if self.has(split, t, v):
-            is_equal = True
-            # we first save it to a tmp tsv file
-            self.write_data(rows, split, t + '.tmp', v + 1)
-            for origin_row, new_row in zip(
-                    self.iter_data(split, t, v),
-                    self.iter_data(split, t + '.tmp', v + 1)):
-                if len(origin_row) != len(new_row):
-                    is_equal = False
-                    break
-                for o, n in zip(origin_row, new_row):
-                    if o != n:
-                        is_equal = False
-                        break
-                if not is_equal:
-                    break
-            if not is_equal:
-                logger.info('creating {} for {}'.format(v + 1, self.name))
-                if generate_info:
-                    self.write_data(generate_info, split,
-                                    '{}.generate.info'.format(t), v + 1)
-                tsv_mv(self.get_data(split, t + '.tmp', v + 1),
-                       self.get_data(split, t, v + 1))
-                return v + 1
-            else:
-                logger.info(
-                    'skip creating a new version since the labels match the latest')
-        else:
-            assert v == 0
-            v = -1
-            logger.info('creating {} for {}'.format(v + 1, self.name))
-            if generate_info:
-                self.write_data(generate_info, split,
-                                '{}.generate.info'.format(t), v + 1)
-            self.write_data(rows, split, t, version=v + 1)
-            return v + 1
-
-    def load_composite_source_data_split(self, split):
-        splitX = split + 'X'
-        pattern = r'data/(.*)/(.*)\.tsv'
-        tsv_sources = [l for l, in tsv_reader(self.get_data(splitX))]
-        matched_result = [re.match(pattern, l).groups() for l in tsv_sources]
-
-        return [(d, s) for d, s in matched_result]
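`load_composite_source_data_split` above assumes each listed source path looks like `data/<name>/<split>.tsv`; a tiny self-contained check of that pattern, with a made-up path:

```python
import re

# hypothetical entry from a trainX.tsv list file
m = re.match(r'data/(.*)/(.*)\.tsv', 'data/coco/train.tsv')
assert m.groups() == ('coco', 'train')  # -> (dataset name, split)
```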
-    def load_composite_source_data_split_versions(self, split):
-        # this function is only valid if we generated the composite dataset
-        # from tsv, not from db. if it is from db, there is no file of
-        # origin.label. use load_composite_source_data_split instead.
-        splitX = split + 'X'
-        pattern = r'data/(.*)/(train|trainval|test)\.label\.v(.*)\.tsv'
-        tsv_sources = [
-            l for l, in tsv_reader(self.get_data(splitX, 'origin.label'))
-        ]
-        matched_result = [re.match(pattern, l).groups() for l in tsv_sources]
-
-        return [(d, s, int(v)) for d, s, v in matched_result]
-
-
-def csv_writer(values, file_name):
-    tsv_writer(values, file_name, sep=',')
-    return
-
-
-class TSVSplitProperty(object):
-    '''
-    one instance of this class means one tsv file or one composite tsv; it
-    could be a label tsv, an hw tsv, or an image tsv
-    '''
-
-    def __init__(self,
-                 data,
-                 split,
-                 t=None,
-                 version=0,
-                 cache_policy=None,
-                 hold_buffer=0):
-        self.data = data
-        self.split = split
-        self.t = t
-        self.version = version
-        dataset = TSVDataset(data)
-        single_tsv = dataset.get_data(split, t, version)
-        # is_single_tsv = op.isfile(single_tsv)
-        is_single_tsv = QDFile.isfile(single_tsv)
-        if is_single_tsv:
-            self.tsv = TSVFile(dataset.get_data(split, t, version),
-                               cache_policy)
-        else:
-            splitX = split + 'X'
-            list_file = dataset.get_data(splitX, t, version=version)
-            seq_file = dataset.get_shuffle_file(split)
-            assert QDFile.isfile(list_file) and QDFile.isfile(seq_file), (
-                '{}, {}/{} not available'.format(single_tsv, list_file,
-                                                 seq_file))
-            self.tsv = CompositeTSVFile(list_file,
-                                        seq_file,
-                                        cache_policy,
-                                        hold_buffer=hold_buffer)
-
-    def get_row_len(self, i):
-        return self.tsv.get_row_len(i)
-
-    def __repr__(self):
-        return 'TSVSplitProperty(tsv={})'.format(self.tsv)
-
-    def __getitem__(self, index):
-        row = self.tsv[index]
-        return row
-
-    def __len__(self):
-        return len(self.tsv)
-
-    def num_rows(self):
-        return len(self)
-
-    def close(self):
-        self.tsv.close()
-
-    def __iter__(self):
-        return iter(self.tsv)
-
-    def get_key(self, i):
-        return self.tsv.seek_first_column(i)
-
-    def seek_first_column(self, idx):
-        return self.tsv.seek_first_column(idx)
-
-    def get_composite_source_idx(self):
-        return self.tsv.get_composite_source_idx()
-
-
-def tsv_writers(all_values, tsv_file_names, sep='\t'):
-    # all_values: a list of [row1, row2, ...]; row i goes to tsv_file_names[i]
-    for tsv_file_name in tsv_file_names:
-        ensure_directory(os.path.dirname(tsv_file_name))
-    tsv_lineidx_files = [
-        os.path.splitext(tsv_file_name)[0] + '.lineidx'
-        for tsv_file_name in tsv_file_names
-    ]
-    tsv_lineidx_8b_files = [x + '.8b' for x in tsv_lineidx_files]
-    tsv_lineidx_8b_file_tmps = [x + '.tmp' for x in tsv_lineidx_8b_files]
-    tsv_file_name_tmps = [
-        tsv_file_name + '.tmp' for tsv_file_name in tsv_file_names
-    ]
-    tsv_lineidx_file_tmps = [
-        tsv_lineidx_file + '.tmp' for tsv_lineidx_file in tsv_lineidx_files
-    ]
-    sep = sep.encode()
-    assert all_values is not None
-    fps = [
-        open(tsv_file_name_tmp, 'wb')
-        for tsv_file_name_tmp in tsv_file_name_tmps
-    ]
-    fpidxs = [
-        open(tsv_lineidx_file_tmp, 'w')
-        for tsv_lineidx_file_tmp in tsv_lineidx_file_tmps
-    ]
-    fpidx8bs = [open(x, 'wb') for x in tsv_lineidx_8b_file_tmps]
-    idxs = [0 for _ in fps]
-    for values in all_values:
-        assert values is not None
-        for i, (value, fp, fpidx,
-                fpidx8b) in enumerate(zip(values, fps, fpidxs, fpidx8bs)):
-            value = map(lambda v: v
-                        if type(v) == bytes else str(v).encode(), value)
-            v = sep.join(value) + b'\n'
-            fp.write(v)
-            fpidx.write(str(idxs[i]) + '\n')
-            fpidx8b.write(idxs[i].to_bytes(8, 'little'))
-            idxs[i] = idxs[i] + len(v)
-    for f in fps:
-        f.close()
-    for f in fpidxs:
-        f.close()
-    for f in fpidx8bs:
-        f.close()
-    # the following might crash if there are two processes which are writing
-    # at the same time. One process finishes the renaming first and the
-    # second one will crash. In this case, we know there must be some errors
-    # when you run the code, and it should be a bug to fix rather than
-    # something to protect with try-catch here.
-    for tsv_file_name_tmp, tsv_file_name in zip(tsv_file_name_tmps,
-                                                tsv_file_names):
-        os.rename(tsv_file_name_tmp, tsv_file_name)
-    for tsv_lineidx_file_tmp, tsv_lineidx_file in zip(tsv_lineidx_file_tmps,
-                                                      tsv_lineidx_files):
-        os.rename(tsv_lineidx_file_tmp, tsv_lineidx_file)
-    for x, y in zip(
-            tsv_lineidx_8b_file_tmps,
-            tsv_lineidx_8b_files,
-    ):
-        os.rename(x, y)
-
-
-def tsv_writer(values, tsv_file_name, sep='\t'):
-    ensure_directory(os.path.dirname(tsv_file_name))
-    tsv_lineidx_file = os.path.splitext(tsv_file_name)[0] + '.lineidx'
-    tsv_8b_file = tsv_lineidx_file + '.8b'
-    idx = 0
-    tsv_file_name_tmp = tsv_file_name + '.tmp'
-    tsv_lineidx_file_tmp = tsv_lineidx_file + '.tmp'
-    tsv_8b_file_tmp = tsv_8b_file + '.tmp'
-    import sys
-    is_py2 = sys.version_info.major == 2
-    if not is_py2:
-        sep = sep.encode()
-    with open(tsv_file_name_tmp,
-              'wb') as fp, open(tsv_lineidx_file_tmp,
-                                'w') as fpidx, open(tsv_8b_file_tmp,
-                                                    'wb') as fp8b:
-        assert values is not None
-        for value in values:
-            assert value is not None
-            if is_py2:
-                v = sep.join(
-                    map(
-                        lambda v: v.encode('utf-8')
-                        if isinstance(v, unicode) else str(v), value)) + '\n'
-            else:
-                value = map(
-                    lambda v: v
-                    if type(v) == bytes else str(v).encode(), value)
-                v = sep.join(value) + b'\n'
-            fp.write(v)
-            fpidx.write(str(idx) + '\n')
-            # although we can use sys.byteorder to retrieve the
-            # system-default byte order, let's always use little-endian to
-            # keep it consistent and simple
-            fp8b.write(idx.to_bytes(8, 'little'))
-            idx = idx + len(v)
-    # the following might crash if there are two processes which are writing
-    # at the same time. One process finishes the renaming first and the
-    # second one will crash. In this case, we know there must be some errors
-    # when you run the code, and it should be a bug to fix rather than
-    # something to protect with try-catch here.
-    os.rename(tsv_file_name_tmp, tsv_file_name)
-    os.rename(tsv_lineidx_file_tmp, tsv_lineidx_file)
-    os.rename(tsv_8b_file_tmp, tsv_8b_file)
-
-
-def tsv_reader(tsv_file_name, sep='\t'):
-    with open(tsv_file_name, 'r') as fp:
-        for line in fp:
-            yield [x.strip() for x in line.split(sep)]
-
-
-def csv_reader(tsv_file_name):
-    return tsv_reader(tsv_file_name, ',')
-
-
-def get_meta_file(tsv_file):
-    return op.splitext(tsv_file)[0] + '.meta.yaml'
-
-
-def extract_label(full_tsv, label_tsv):
-    if op.isfile(label_tsv):
-        logger.info('label file exists and will skip to generate: {}'.format(
-            label_tsv))
-        return
-    if not op.isfile(full_tsv):
-        logger.info('the file {} does not exist'.format(full_tsv))
-        return
-    rows = tsv_reader(full_tsv)
-
-    def gen_rows():
-        for i, row in enumerate(rows):
-            if (i % 1000) == 0:
-                logger.info('extract_label: {}-{}'.format(full_tsv, i))
-            del row[2]
-            assert len(row) == 2
-            assert type(row[0]) == str
-            assert type(row[1]) == str
-            yield row
-
-    tsv_writer(gen_rows(), label_tsv)
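A minimal round trip over the writer/reader pair above, assuming `/tmp` is writable. Because `tsv_writer` only renames the `.tmp` files into place at the end, a reader never observes a half-written tsv:

```python
tsv_writer([('key1', 'a'), ('key2', 'b')], '/tmp/demo.tsv')
assert list(tsv_reader('/tmp/demo.tsv')) == [['key1', 'a'], ['key2', 'b']]
# random access goes through the .lineidx generated alongside the tsv
assert TSVFile('/tmp/demo.tsv').seek(1) == ['key2', 'b']
```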
-
-
-def create_inverted_tsv(rows, inverted_label_file, label_map):
-    '''
-    deprecated, use create_inverted_list
-    save the results based on the label_map in label_map_file. The benefit is
-    to seek the row given a label
-    '''
-    inverted = {}
-    for i, row in enumerate(rows):
-        labels = json.loads(row[1])
-        if type(labels) is list:
-            # detection dataset
-            curr_unique_labels = set([l['class'] for l in labels])
-        else:
-            assert type(labels) is int
-            curr_unique_labels = [label_map[labels]]
-        for l in curr_unique_labels:
-            assert isinstance(l, str)
-            if l not in inverted:
-                inverted[l] = [i]
-            else:
-                inverted[l].append(i)
-
-    def gen_rows():
-        for label in inverted:
-            assert label in label_map
-        for label in label_map:
-            i = inverted[label] if label in inverted else []
-            yield label, ' '.join(map(str, i))
-
-    tsv_writer(gen_rows(), inverted_label_file)
-
-
-def create_inverted_list2(rows, th=None):
-    inverted = {}
-    keys = []
-    for i, row in enumerate(rows):
-        keys.append(row[0])
-        labels = json.loads(row[1])
-        if th is not None:
-            labels = [
-                r for r in labels
-                if 'conf' in r and r['conf'] > th or 'conf' not in r
-            ]
-        if type(labels) is list:
-            # detection dataset
-            curr_unique_labels = set([l['class'] for l in labels])
-        else:
-            assert type(labels) is int
-            curr_unique_labels = [str(labels)]
-        for l in curr_unique_labels:
-            assert isinstance(l, str)
-            if l not in inverted:
-                inverted[l] = [i]
-            else:
-                inverted[l].append(i)
-    return inverted, keys
-
-
-def is_verified_rect(rect):
-    # allowed_keys = set(['class', 'rect', 'uhrs_confirm', 'uhrs_uncertain',
-    #                     'conf', 'merge_from', 'class_from', 'change_from',
-    #                     'from', 'diff', 'IsInside', 'IsGroupOf',
-    #                     'IsDepiction', 'IsOccluded', 'IsTruncated',
-    #                     'workerId', 'class_propagate_from', 'obj', 'uhrs'])
-    # unknown_keys = [k for k in rect if k not in allowed_keys]
-    # if len(unknown_keys) > 0:
-    #     logger.info('unknown keys = {}\n'.format(pformat(unknown_keys)))
-
-    if 'uhrs' in rect:
-        judge_result = rect['uhrs']
-        assert judge_result.get('1', 0) >= judge_result.get('2', 0)
-        return True
-
-    if 'class' not in rect or 'rect' not in rect:
-        return False
-
-    if 'uhrs_confirm' in rect:
-        assert rect['uhrs_confirm'] > 0
-        return True
-
-    if 'conf' in rect and rect['conf'] < 1:
-        return False
-
-    if 'merge_from' in rect:
-        return all(is_verified_rect(r) for r in rect['merge_from'])
-
-    return True
-
-
-def create_inverted_list(rows):
-    inverted = {}
-    inverted_with_bb = {}
-    inverted_no_bb = {}
-    inverted_with_bb_verified = {}
-    inverted_with_bb_noverified = {}
-    logger.info('creating inverted')
-    for i, row in tqdm(enumerate(rows), mininterval=2):
-        labels = json.loads(row[1])
-        if type(labels) is list:
-            # detection dataset
-            curr_unique_labels = set([l['class'] for l in labels])
-            curr_unique_with_bb_labels = set([
-                l['class'] for l in labels
-                if 'rect' in l and any(x != 0 for x in l['rect'])
-            ])
-            curr_unique_no_bb_labels = set([
-                l['class'] for l in labels
-                if 'rect' not in l or all(x == 0 for x in l['rect'])
-            ])
-            curr_unique_with_bb_verified_labels = set([
-                l['class'] for l in labels if 'rect' in l and any(
-                    x != 0 for x in l['rect']) and is_verified_rect(l)
-            ])
-            curr_unique_with_bb_noverified_labels = set([
-                l['class'] for l in labels if 'rect' in l and any(
-                    x != 0 for x in l['rect']) and not is_verified_rect(l)
-            ])
-        else:
-            assert type(labels) is int
-            curr_unique_labels = [str(labels)]
-            curr_unique_with_bb_labels = []
-            curr_unique_no_bb_labels = curr_unique_labels
-            curr_unique_with_bb_verified_labels = set()
-            curr_unique_with_bb_noverified_labels = set()
-
-        def update(unique_labels, inv):
-            for l in unique_labels:
-                assert type(l) == str
-                if l not in
inv: - inv[l] = [i] - else: - inv[l].append(i) - - update(curr_unique_labels, inverted) - update(curr_unique_with_bb_labels, inverted_with_bb) - update(curr_unique_no_bb_labels, inverted_no_bb) - update(curr_unique_with_bb_verified_labels, inverted_with_bb_verified) - update(curr_unique_with_bb_noverified_labels, - inverted_with_bb_noverified) - return { - 'inverted.label': inverted, - 'inverted.label.with_bb': inverted_with_bb, - 'inverted.label.no_bb': inverted_no_bb, - 'inverted.label.with_bb.verified': inverted_with_bb_verified, - 'inverted.label.with_bb.noverified': inverted_with_bb_noverified - } - - -def tsv_shuffle_reader(tsv_file): - logging.warn('deprecated: using TSVFile to randomly seek') - lineidx_file = op.splitext(tsv_file)[0] + '.lineidx' - lineidx = load_list_file(lineidx_file) - random.shuffle(lineidx) - with open(tsv_file, 'r') as fp: - for l in lineidx: - fp.seek(int(float(l))) - yield [x.strip() for x in fp.readline().split('\t')] - - -def load_labelmap(data): - dataset = TSVDataset(data) - return dataset.load_labelmap() - - -def get_caption_data_info(name): - dataset = TSVDataset(name) - splits = get_default_splits() - from collections import defaultdict - split_to_versions = defaultdict(list) - for split in splits: - v = 0 - while True: - if not dataset.has(split, 'caption', v): - break - split_to_versions[split].append(v) - v = v + 1 - return split_to_versions - - -def get_all_data_info2(name=None): - if name is None: - return sorted(os.listdir('./data')) - else: - dataset = TSVDataset(name) - valid_split_versions = [] - splits = get_default_splits() - - for split in splits: - v = 0 - while True: - if not dataset.has(split, 'label', v): - break - if dataset.has(split, 'inverted.label.count', v): - label_count_rows = dataset.iter_data( - split, 'inverted.label.count', v) - label_count = [(r[0], int(r[1])) for r in label_count_rows] - label_count = sorted(label_count, key=lambda x: x[1]) - else: - label_count = [] - valid_split_versions.append( - (split, v, "", [(i, l, c) - for i, (l, c) in enumerate(label_count)])) - v = v + 1 - name_splits_labels = [(name, valid_split_versions)] - return name_splits_labels - - -def get_all_data_info(): - names = os.listdir('./data') - name_splits_labels = [] - names.sort(key=lambda n: n.lower()) - for name in names: - dataset = TSVDataset(name) - if not op.isfile(dataset.get_labelmap_file()): - continue - labels = dataset.load_labelmap() - valid_splits = [] - if len(dataset.get_train_tsvs()) > 0: - valid_splits.append('train') - for split in ['trainval', 'test']: - if not op.isfile(dataset.get_data(split)): - continue - valid_splits.append(split) - name_splits_labels.append((name, valid_splits, labels)) - return name_splits_labels - - -def load_labels(file_name): - rows = tsv_reader(file_name) - key_to_rects = {} - key_to_idx = {} - for i, row in enumerate(rows): - key = row[0] - rects = json.loads(row[1]) - #assert key not in labels, '{}-{}'.format(file_name, key) - key_to_rects[key] = rects - key_to_idx[key] = i - return key_to_rects, key_to_idx - - -def azcopy_read(fname): - # we ignore fname since it could be a mounted blobfuse folder in AML - local_file_name = op.join( - '/tmp', '{}_{}'.format( - get_user_name(), - hash_sha1(op.realpath(op.abspath(fname))) + op.splitext(fname)[1])) - if op.isfile(local_file_name): - return open(local_file_name, 'r') - config_file = os.environ['FILE_OPEN_AZCOPY_BLOB_ACCOUNT_PATH'] - from .cloud_storage import create_cloud_storage - remote_path = op.join( - 
os.environ['FILE_OPEN_AZCOPY_REMOTE_PATH'], - op.relpath(fname, os.environ['FILE_OPEN_AZCOPY_LOCAL_PATH'])) - c = create_cloud_storage(config_file=config_file) - logger.info('downloading from {} to {} for {}'.format( - remote_path, local_file_name, fname)) - c.az_download(remote_path, local_file_name) - return open(local_file_name, 'r') - - -def load_list_file(fname): - # prefer this than .qd_common.load_list_file - with QDFile.open(fname) as fp: - lines = fp.readlines() - result = [line.strip() for line in lines] - if len(result) > 0 and result[-1] == '': - result = result[:-1] - return result - - -def generate_lineidx8b_from_lineidx(lineidx, lineidx_8b): - tsv_8b_file_tmp = lineidx_8b + '.tmp' - logger.info(lineidx) - with open(tsv_8b_file_tmp, 'wb') as fp8b: - for i, in tqdm(tsv_reader(lineidx)): - fp8b.write(int(i).to_bytes(8, 'little')) - os.rename(tsv_8b_file_tmp, lineidx_8b) - - -def convert_data_to_yaml(data, - split, - yaml, - is_train=True, - label=None, - feature=None, - qd_format=False, - label_version=None, - feature_version=None): - # used for captioning-related scripts - if qd_format: - info = { - 'feature': feature if feature is not None else { - 'data': data, - 'split': split, - 't': 'feature', - 'version': feature_version, - }, - 'hw': { - 'data': data, - 'split': split, - 't': 'hw' - }, - 'img': { - 'data': data, - 'split': split - }, - 'label': label if label is not None else { - 'data': data, - 'split': split, - 't': 'label', - 'version': label_version, - }, - 'caption': { - 'data': data, - 'split': split, - 't': 'hw' - }, - 'composite': False, - 'qd_format': True, - } - else: - assert label is None and feature is None - # will be deprecated - from .tsv_io import TSVDataset - yaml_folder = op.dirname(yaml) - dataset = TSVDataset(data) - if not op.isfile(dataset.get_data(split + 'X')): - # we prefer to use the composite - info = { - 'feature': - op.relpath( - dataset.get_data('train', - 'feature', - version=feature_version), yaml_folder), - 'label': - op.relpath( - dataset.get_data(split, 'label', version=label_version), - yaml_folder), - 'hw': - op.relpath(dataset.get_data(split, 'hw'), yaml_folder), - 'img': - op.relpath(dataset.get_data(split), yaml_folder), - 'caption': - op.relpath(dataset.get_data(split, 'caption'), yaml_folder), - 'composite': - False, - } - else: - - def get_rel_path(p): - return op.relpath(op.realpath(p), op.realpath(yaml_folder)) - - splitX = split + 'X' - from .qd_common import load_list_file - info = { - 'feature': - list( - map( - get_rel_path, - load_list_file( - dataset.get_data(splitX, - 'feature', - version=feature_version)))), - 'label': - list( - map( - get_rel_path, - load_list_file( - dataset.get_data(splitX, - 'label', - version=label_version)))), - 'hw': - list( - map(get_rel_path, - load_list_file(dataset.get_data(splitX, 'hw')))), - 'img': - list( - map(get_rel_path, - load_list_file(dataset.get_data(splitX)))), - 'caption': - list( - map(get_rel_path, - load_list_file(dataset.get_data(splitX, 'caption')))), - 'composite': - True, - } - if is_train: - caption_linelist = dataset.get_data(split, 'caption_linelist') - assert op.isfile(caption_linelist) - info['caption_linelist'] = caption_linelist - else: - caption_linelist = dataset.get_data(split, - 'caption_linelist_test') - if not op.isfile(caption_linelist): - from .tsv_io import tsv_reader - tsv_writer(((a, b, 0) for a, b in tsv_reader( - dataset.get_shuffle_file(split))), caption_linelist) - info['caption_linelist'] = caption_linelist - from .qd_common import 
write_to_yaml_file - write_to_yaml_file(info, yaml) - - -class QDFile(object): - initialized = False - use_fuser = False - fuser = None - - @classmethod - def ensure_initialized(cls): - if not cls.initialized: - cls.initialized = True - cls.use_fuser = int(os.environ.get('QD_TSV_USE_FUSE', '0')) - if cls.use_fuser: - from .cloud_storage import create_cloud_fuse - cls.fuser = create_cloud_fuse() - fns = os.environ.get('QD_USE_FUSE_CACHE_AT_INIT') - if fns is not None: - fns = fns.split(',') - cls.fuser.ensure_cache(fns) - - @classmethod - def isfile(cls, fname): - cls.ensure_initialized() - if cls.use_fuser: - return cls.fuser.isfile(fname) - else: - return op.isfile(fname) - - @classmethod - def open(cls, fname, mode='r'): - cls.ensure_initialized() - if cls.use_fuser: - return cls.fuser.open(fname, mode) - else: - return exclusive_open_to_read(fname, mode) - - @classmethod - def get_file_size(cls, fname): - cls.ensure_initialized() - if cls.use_fuser: - return cls.fuser.get_file_size(fname) - else: - return get_file_size(fname) - - @classmethod - def prepare(cls, file_or_fnames): - if isinstance(file_or_fnames, str): - file_or_fnames = [file_or_fnames] - fnames = file_or_fnames - cls.ensure_initialized() - if cls.use_fuser: - cls.fuser.ensure_cache(fnames) diff --git a/README.md b/README.md index 8039589..ce695b8 100644 --- a/README.md +++ b/README.md @@ -6,87 +6,8 @@ This repository provides the official implementation for the CVPR2023 paper "Fine-grained Audible Video Description". We build a novel task: **FAVD** and a new dataset: **FAVDBench** in this paper. - -

- -

-
-## Apply for Dataset
-
-You can access the FAVDBench dataset by visiting the [OpenNLPLab/Download](http://www.avlbench.opennlplab.cn/download) webpage. To obtain the dataset, please complete the corresponding [Google Forms](https://forms.gle/5S3DWpBaV1UVczkf8). Once we receive your application, we will respond promptly. Alternatively, if you encounter any issues with the form, you can also submit your application (indicating your name and affiliation) via email to opennlplab@gmail.com.
-
-* FAVDBench Dataset Google Forms: https://forms.gle/5S3DWpBaV1UVczkf8
-
-The downloaded data can be placed in or linked from the directory `AVLFormer/datasets`.
-
-
-## Installation
-In general, the code requires `python>=3.7`, as well as `pytorch>=1.10` and `torchvision>=0.8`. You can follow [`recommend_env.sh`](https://github.com/OpenNLPLab/FAVDBench/blob/main/recommend_env.sh) to configure a recommended conda environment:
-1. Create the virtual env
-    ```bash
-    conda create -n FAVDBench; conda activate FAVDBench
-    ```
-2. Install pytorch-related packages:
-    ```bash
-    conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch
-    ```
-3. Install basic packages:
-    ```bash
-    pip install fairscale opencv-python
-    pip install deepspeed PyYAML fvcore ete3 transformers pandas timm h5py
-    pip install tensorboardX easydict progressbar matplotlib future deprecated scipy av scikit-image boto3 einops addict yapf
-    ```
-4. Install mmcv-full:
-    ```bash
-    pip install mmcv-full==1.6.1 -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.12/index.html
-    ```
-5. Install apex:
-    ```bash
-    git clone https://github.com/NVIDIA/apex
-    cd apex
-    pip install -v --disable-pip-version-check --no-cache-dir ./
-    ```
-6. Clone the related repos for evaluation:
-    ```bash
-    cd ./AVLFormer/src/evalcap
-    git clone https://github.com/xiaoweihu/cider.git
-    git clone https://github.com/LuoweiZhou/coco-caption.git
-    mv ./coco-caption ./coco_caption
-    ```
-7. Install ffmpeg & ffprobe
-    * Use `ffmpeg -version` and `ffprobe -version` to check whether ffmpeg and ffprobe are installed.
-    * Installation guidelines:
-      ```bash
-      # For ubuntu
-      sudo apt update
-      sudo apt install ffmpeg
-
-      # For mac
-      brew update
-      brew install ffmpeg
-      ```
-
-## Dataset Preparation
-
-
-## License
-
-
-## Citation
-If you use FAVD or FAVDBench in your research, please use the following BibTeX entry.
- -``` -@InProceedings{Shen_2023_CVPR, - author = {Shen, Xuyang and Li, Dong and Zhou, Jinxing and Qin, Zhen and He, Bowen and Han, Xiaodong and Li, Aixuan and Dai, Yuchao and Kong, Lingpeng and Wang, Meng and Qiao, Yu and Zhong, Yiran}, - title = {Fine-Grained Audible Video Description}, - booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, - month = {June}, - year = {2023}, - pages = {10585-10596} -} -``` +--- +### Empty branch, please return to main branch +--- \ No newline at end of file diff --git a/images/task_intro.png b/images/task_intro.png deleted file mode 100644 index 8967dfb..0000000 Binary files a/images/task_intro.png and /dev/null differ diff --git a/recommend_env.sh b/recommend_env.sh deleted file mode 100644 index fb5e8c5..0000000 --- a/recommend_env.sh +++ /dev/null @@ -1,24 +0,0 @@ -# create virtual env -conda create -n FAVDBench -# activate virtual env -conda activate FAVDBench - -# install packages -conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch -pip install fairscale opencv-python -pip install deepspeed PyYAML fvcore ete3 transformers pandas timm h5py -pip install tensorboardX easydict progressbar matplotlib future deprecated scipy av scikit-image boto3 einops addict yapf - -# install mmcv -pip install mmcv-full==1.6.1 -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.12/index.html - -# install apex -git clone https://github.com/NVIDIA/apex -cd apex -pip install -v --disable-pip-version-check --no-cache-dir ./ - -# clone related repo for eval -cd ./AVLFormer/src/evalcap -git clone https://github.com/xiaoweihu/cider.git -git clone https://github.com/LuoweiZhou/coco-caption.git -mv ./coco-caption ./coco_caption \ No newline at end of file