Skip to content

Commit

Permalink
Added code for offline use.
Browse files Browse the repository at this point in the history
  • Loading branch information
mille-s committed Nov 29, 2022
1 parent ac5846e commit 4eec0a8
Showing 1 changed file with 23 additions and 3 deletions.
26 changes: 23 additions & 3 deletions UD_Converter_release.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,14 @@
"from IPython.display import clear_output \n",
"! gdown 1m4KZt3HO1O0bgLiddBfpDkpvMjS6XSbZ\n",
"! unzip /content/UD_Converter_colab.zip\n",
"clear_output()\n",
"#clear_output()\n",
"\n",
"# If there are any issues, the zip file can be found at the following shared link: https://drive.google.com/file/d/1m4KZt3HO1O0bgLiddBfpDkpvMjS6XSbZ/view?usp=sharing"
],
"metadata": {
"id": "gW1f_juqCmNC"
},
"execution_count": 1,
"execution_count": null,
"outputs": []
},
{
Expand Down Expand Up @@ -70,7 +70,7 @@
"#============================================================================================================\n",
"# GENERAL PARAMETERS (please read comments before parameters to avoid most errors)\n",
"#============================================================================================================\n",
"# path to working folder (which must contain the buddy-core tools that convert to t2)\n",
"# path to working folder (which must contain the buddy-core tools that convert to t2 and the other .py files)\n",
"path_jars = '/content/UD_Converter'\n",
"# !!! path to input folder; file names in the input folder should not contain spaces or parentheses\n",
"# !!! needs to exist and have some CoNLL(-U) files inside, if possible with a 2-letter prefix to indicate the language (e.g. en_ewt-UD.conllu, fr_myfile.conllu); supported: en, fr, es\n",
Expand Down Expand Up @@ -209,6 +209,8 @@
"print('\\nChecking files to split...\\n')\n",
"path_splitFiles = os.path.join(path_jars, 'splitFiles.py')\n",
"!python {path_splitFiles} {inputFolder} 'utf-8' {strPerFile} 'all' {tmpIn}\n",
"# Code for offline usage; replace the previous line with the following:\n",
"# subprocess.call(['python', 'splitFiles.py', inputFolder, 'utf-8', strPerFile, 'all', tmpIn])\n",
"\n",
"# Conversion to format that can be loaded by the .jar.\n",
"# Parameters:\n",
Expand All @@ -221,6 +223,8 @@
"convertFolder = tmpIn\n",
"path_conllu2conll = os.path.join(path_jars, 'conllu2conll.py')\n",
"!python {path_conllu2conll} {inputFormat} {convertFolder} {originalID} {originalForm} {originalXpos} {parentheses} {quotationMarks} {orderPunc} {orderConj} {orderMWE} {track} {dt} {sentOutTmp} {reduce_tree}\n",
"# Code for offline usage; replace the previous line with the following:\n",
"# subprocess.call(['python', 'conllu2conll.py', inputFormat, convertFolder, originalID, originalForm, originalXpos, parentheses, quotationMarks, orderPunc, orderConj, orderMWE, track, dt, sentOutTmp, reduce_tree])\n",
"\n",
"# Scrambling of the files to remove order information.\n",
"# Parameters:\n",
Expand All @@ -237,6 +241,8 @@
"for file2Scramble in files2Scramble:\n",
" if scramble == 'yes':\n",
" !python {path_conllScramble} {file2Scramble} {surfOutTmp} {track} {dt} {tmpIn}\n",
" # Code for offline usage; replace the previous line with the following:\n",
" # subprocess.call(['python', 'conllScramble.py', file2Scramble, surfOutTmp, track, dt, tmpIn])\n",
" else:\n",
" print('\\nNo file scrambled!')\n",
" copyfile(os.path.join(tmpIn, 'conllu2conll', file2Scramble), os.path.join(surfOutTmp, file2Scramble))\n",
Expand Down Expand Up @@ -290,18 +296,24 @@
" if len(dir_contents_deep) > 0:\n",
" print('\\nConcatenating deep structures...')\n",
" !python {path_concatenateFiles} {deepOutTmp} {deepOut} 'utf-8' 'utf-8' {inputFormat} 'deep' {track} {dt}\n",
" # Code for offline usage; replace the previous line with the following:\n",
" # subprocess.call(['python', 'concatenateFiles.py', deepOutTmp, deepOut, 'utf-8', 'utf-8', inputFormat, 'deep', track, dt])\n",
"\n",
"if os.path.exists(surfOutTmp):\n",
" dir_contents_surf = [x for x in os.listdir(surfOutTmp) if not x.startswith('.')]\n",
" if len(dir_contents_surf) > 0:\n",
" print('\\nConcatenating surface structures...')\n",
" !python {path_concatenateFiles} {surfOutTmp} {surfOut} 'utf-8' 'utf-8' {inputFormat} 'surf' {track} {dt}\n",
" # Code for offline usage; replace the previous line with the following:\n",
" # subprocess.call(['python', 'concatenateFiles.py', surfOutTmp, surfOut, 'utf-8', 'utf-8', inputFormat, 'surf', track, dt])\n",
" \n",
"if os.path.exists(sentOutTmp):\n",
" dir_contents_sent = [x for x in os.listdir(sentOutTmp) if not x.startswith('.')]\n",
" if len(dir_contents_sent) > 0:\n",
" print('\\nConcatenating sentences...')\n",
" !python {path_concatenateFiles} {sentOutTmp} {sentOut} 'utf-8' 'utf-8' 'txt' 'sent' {track} {dt}\n",
" # Code for offline usage; replace the previous line with the following:\n",
" # subprocess.call(['python', 'concatenateFiles.py', sentOutTmp, sentOut, 'utf-8', 'utf-8', 'txt', 'sent', track, dt])\n",
"\n",
"try:\n",
" shutil.rmtree(tmpOut)\n",
Expand All @@ -321,6 +333,8 @@
" path_checkWellFormedness = os.path.join(path_jars, 'checkWellFormedness.py')\n",
"\n",
" !python {path_checkAlignments} {inputFolder} {surfOut} {debugFolder} 'utf-8' 'UD2surf' {dt} {scramble}\n",
" # Code for offline usage; replace the previous line with the following:\n",
" # subprocess.call(['python', 'checkAlignments.py', inputFolder, surfOut, debugFolder, 'utf-8', 'UD2surf', dt, scramble])\n",
"\n",
" # If the deep structures were kept from a previous execution, check their alignment too\n",
" if keep_deep == 'yes':\n",
Expand All @@ -330,9 +344,13 @@
"\n",
" print('\\nChecking alignments between surface and deep files......\\n')\n",
" !python {path_checkAlignments} {surfOut} {deepOut} {debugFolder} 'utf-8' 'surf2deep' {dt} {scramble}\n",
" # Code for offline usage; replace the previous line with the following:\n",
" # subprocess.call(['python', 'checkAlignments.py', surfOut, deepOut, debugFolder, 'utf-8', 'surf2deep', dt, scramble])\n",
"\n",
" print('\\nChecking alignments between original UD and deep files......\\n')\n",
" !python {path_checkAlignments} {inputFolder} {deepOut} {debugFolder} 'utf-8' 'UD2deep' {dt} {scramble}\n",
" # Code for offline usage; replace the previous line with the following:\n",
" # subprocess.call(['python', 'checkAlignments.py', inputFolder, deepOut, debugFolder, 'utf-8', 'UD2deep', dt, scramble])\n",
" \n",
" print('\\nChecking deep tree well-formedness...\\n')\n",
" # File check: a small script that checks the contents of the output files. It looks for configurations that in theory should not happen: disconnections, cycles, repeated argument numbers, multiple incoming dependencies (in case of tree input). A log file is created in the debug folder (log_treeness.txt), and optionally, folders with the ill-formed files.\n",
Expand All @@ -347,6 +365,8 @@
" for outFile in listFinalFilepaths:\n",
" print(outFile)\n",
" !python {path_checkWellFormedness} {debugFolder} {outFile} 'utf-8' 'tree' {inputFolder} {inputFormat}\n",
" # Code for offline usage; replace the previous line with the following:\n",
" # subprocess.call(['python', 'checkWellFormedness.py', debugFolder, outFile, 'utf-8', 'tree', inputFolder, inputFormat])\n",
" else:\n",
" pass\n",
" \n",
Expand Down

0 comments on commit 4eec0a8

Please sign in to comment.