diff --git a/finetuning-English-GPT2-language-Polish-HuggingFace-fastaiv2.ipynb b/finetuning-English-GPT2-language-Polish-HuggingFace-fastaiv2.ipynb new file mode 100644 index 0000000..1bf000a --- /dev/null +++ b/finetuning-English-GPT2-language-Polish-HuggingFace-fastaiv2.ipynb @@ -0,0 +1 @@ +{"nbformat":4,"nbformat_minor":0,"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.7"},"colab":{"name":"05_finetuning-English-GPT2-any-language-Polish-HuggingFace-fastaiv2.ipynb","provenance":[{"file_id":"1n7sol-CvBSblO33ScpgQQIfy5v_VJVQb","timestamp":1597923568454},{"file_id":"1qrhSZ4nBKgv2sz_-OjzEsiuWYHUyOeq9","timestamp":1597489060569},{"file_id":"1d_tsC-i3804eHIBJsoy6QNZ6Jm4JtgJ-","timestamp":1597391589678}],"collapsed_sections":["D18pRY2C9NUf","eAs4xPsR9NUj","Z6oUdKay9NUv","kQrp3wcB9NVL","DAvDfxhq9NVQ","6NUQ_V299NVR","7bKWxnx19NVT","i2frlbhv9NVU","j6WB2Qyy9NVX","P-2JYQnD9N6F","NBx8vCFkBtDu","6m4mW5zf9NZZ","GeSgUiiz9NZq","ZmCINL21SHRV","Tazygj-q9NaT","V2tWq54U9NcQ","N58GF3Xe9NcR","6tR_rxei9NcR","0XfIwXNa9NcS","WSUu-lDh9NcW","q_xSs_0P9Nct","gNw5iw0w9Nc0","1TtSvmLQ9NdK"],"toc_visible":true},"accelerator":"GPU"},"cells":[{"cell_type":"markdown","metadata":{"id":"D18pRY2C9NUf"},"source":["# Faster than training from scratch \n","# Fine-tuning the English GPT-2 in any language with Hugging Face and fastai v2 \n","\n","> Tutorial on how to use fastai v2 over Hugging Face's Transformers and Tokenizers libraries to fine-tune an English pre-trained transformer-based language model (GPT-2) to any language other than English"]},{"cell_type":"markdown","metadata":{"id":"C-Cx5Xrk9NUh"},"source":["Notebook is based on work of Pierre Guillou (https://www.linkedin.com/in/pierreguillou)\n","\n","Other resources used:\n","---\n","\n","\n","- Post in 
medium: [Faster than training from scratch - Fine-tuning the English GPT-2 in any language with Hugging Face and fastai v2 (practical case with Portuguese)](https://medium.com/@pierre_guillou/faster-than-training-from-scratch-fine-tuning-the-english-gpt-2-in-any-language-with-hugging-f2ec05c98787)\n","- Fast notebook: [finetuning-English-GPT2-any-language-Portuguese-HuggingFace-fastaiv2_FAST.ipynb](https://github.com/piegu/fastai-projects/blob/master/finetuning-English-GPT2-any-language-Portuguese-HuggingFace-fastaiv2_FAST.ipynb)\n","- Hugging face model page of [GPorTuguese-2](https://huggingface.co/pierreguillou/gpt2-small-portuguese): a language model for Portuguese text generation (and more NLP tasks...)\n","- Other posts in medium of the GPT-2 series: \n"," - [NLP & fastai | GPT-2](https://medium.com/@pierre_guillou/nlp-fastai-gpt-2-16ee145a4a28)\n"," - [Byte-level BPE, an universal tokenizer but...](https://medium.com/@pierre_guillou/byte-level-bpe-an-universal-tokenizer-but-aff932332ffe)"]},{"cell_type":"code","metadata":{"id":"VI_AUR8K9ncO","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1616061296952,"user_tz":-60,"elapsed":30851,"user":{"displayName":"Mark Lina","photoUrl":"","userId":"17651129667533642938"}},"outputId":"c1d01012-4596-4c52-ef71-bf2e32ea8d76"},"source":["#start by mounting google drive\n","from google.colab import drive, files\n","drive.mount('/content/gdrive', force_remount=True)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Mounted at /content/gdrive\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"yzT-SC9hmTKG","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1615837934244,"user_tz":-60,"elapsed":167861,"user":{"displayName":"Mark Lina","photoUrl":"","userId":"17651129667533642938"}},"outputId":"36c1fb8b-6b9e-4fb1-be39-8b68fdfbfa23"},"source":["# need to instal fastai 2 etc before \n","!pip install -q 
git+https://github.com/fastai/fastai\n","!pip install -q git+https://github.com/fastai/fastcore\n","!pip install -q iterative-stratification"],"execution_count":null,"outputs":[{"output_type":"stream","text":["\u001b[K |████████████████████████████████| 61kB 3.0MB/s \n","\u001b[K |████████████████████████████████| 12.8MB 326kB/s \n","\u001b[K |████████████████████████████████| 776.8MB 21kB/s \n","\u001b[?25h Building wheel for fastai (setup.py) ... \u001b[?25l\u001b[?25hdone\n","\u001b[31mERROR: torchtext 0.9.0 has requirement torch==1.8.0, but you'll have torch 1.7.1 which is incompatible.\u001b[0m\n"," Building wheel for fastcore (setup.py) ... \u001b[?25l\u001b[?25hdone\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"g-Zna9nuQE8C","executionInfo":{"status":"ok","timestamp":1616061331938,"user_tz":-60,"elapsed":582,"user":{"displayName":"Mark Lina","photoUrl":"","userId":"17651129667533642938"}},"outputId":"ec9b1082-42d2-4773-ece4-ad60c5f567f7"},"source":["cd /content/gdrive/MyDrive/fastai"],"execution_count":null,"outputs":[{"output_type":"stream","text":["/content/gdrive/MyDrive/fastai\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"a4VVvLIhQBjv"},"source":["from nlputilsfastai import * # augumented py file ---> from fastai.basics import * # was fastai2"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":309},"id":"6prc1OyVI221","executionInfo":{"status":"ok","timestamp":1615837583603,"user_tz":-60,"elapsed":2993,"user":{"displayName":"Mark Lina","photoUrl":"","userId":"17651129667533642938"}},"outputId":"d9fc6e69-482f-496f-e52c-f49b41a3694b"},"source":["# !pip install fastcore==1.3.8"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Collecting fastcore==1.3.8\n","\u001b[?25l Downloading 
https://files.pythonhosted.org/packages/26/53/d79c0f942f8bb44903108462541130b53fc7b4d744b1b5df9127b0b524d6/fastcore-1.3.8-py3-none-any.whl (48kB)\n","\r\u001b[K |██████▉ | 10kB 19.8MB/s eta 0:00:01\r\u001b[K |█████████████▋ | 20kB 25.6MB/s eta 0:00:01\r\u001b[K |████████████████████▍ | 30kB 23.5MB/s eta 0:00:01\r\u001b[K |███████████████████████████▏ | 40kB 26.4MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 51kB 5.8MB/s \n","\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from fastcore==1.3.8) (20.9)\n","Requirement already satisfied: pip in /usr/local/lib/python3.7/dist-packages (from fastcore==1.3.8) (19.3.1)\n","Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->fastcore==1.3.8) (2.4.7)\n","Installing collected packages: fastcore\n"," Found existing installation: fastcore 1.3.20\n"," Uninstalling fastcore-1.3.20:\n"," Successfully uninstalled fastcore-1.3.20\n","Successfully installed fastcore-1.3.8\n"],"name":"stdout"},{"output_type":"display_data","data":{"application/vnd.colab-display-data+json":{"pip_warning":{"packages":["fastcore"]}}},"metadata":{"tags":[]}}]},{"cell_type":"markdown","metadata":{"id":"CmKm8BETBpB2"},"source":["# 1. 
Installing required libraries and mounting Google Drive"]},{"cell_type":"code","metadata":{"id":"nqp4kpUG9tsV"},"source":["# Start by mounting Google Drive\n","from google.colab import drive, files\n","drive.mount('/content/gdrive', force_remount=True)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"M2y5n_KU99lq","executionInfo":{"status":"ok","timestamp":1616146896947,"user_tz":-60,"elapsed":51225,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}},"outputId":"48ff0ebc-9a18-4c63-d0da-b8bcd92a5dbb"},"source":["# fastai v2, fastcore and iterative-stratification must be installed before anything else\n","%%time\n","!pip install -q git+https://github.com/fastai/fastai\n","!pip install -q git+https://github.com/fastai/fastcore\n","!pip install -q iterative-stratification"],"execution_count":2,"outputs":[{"output_type":"stream","text":[" Building wheel for fastai (setup.py) ... \u001b[?25l\u001b[?25hdone\n"," Building wheel for fastcore (setup.py) ... \u001b[?25l\u001b[?25hdone\n","CPU times: user 121 ms, sys: 34.4 ms, total: 156 ms\n","Wall time: 50.3 s\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"P-2JYQnD9N6F"},"source":["# 2. 
Initialization"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"A7Z0xlgv-TvJ","executionInfo":{"status":"ok","timestamp":1616146896951,"user_tz":-60,"elapsed":42139,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}},"outputId":"cd263a63-2153-449f-f332-860943bdc36c"},"source":["cd /content/gdrive/MyDrive/fastai"],"execution_count":3,"outputs":[{"output_type":"stream","text":["/content/gdrive/MyDrive/fastai\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"nbpresent":{"id":"151cd18f-76e3-440f-a8c7-ffa5c6b5da01"},"id":"RsJYkiK99N6G","executionInfo":{"status":"ok","timestamp":1616146901723,"user_tz":-60,"elapsed":46905,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}}},"source":["# from fastai2.text.all import *\n","# from nlputils_fastai2 import * \n","\n","from fastai.text.all import *\n","from nlputilsfastai import * \n","\n","%reload_ext autoreload\n","%autoreload 2\n","%matplotlib inline"],"execution_count":4,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"lHjl3W7HBdej","executionInfo":{"status":"ok","timestamp":1616146901725,"user_tz":-60,"elapsed":46901,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}},"outputId":"fd434dfa-63ba-4bb6-c99c-769fd52fa102"},"source":["gpu = 0\n","torch.cuda.set_device(gpu)\n","print(f'cuda device: {torch.cuda.current_device()}')\n","print(f'cuda device name: {torch.cuda.get_device_name(gpu)}')"],"execution_count":5,"outputs":[{"output_type":"stream","text":["cuda device: 0\n","cuda device name: Tesla K80\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"gK0-GZGhC4nF","executionInfo":{"status":"ok","timestamp":1616146901727,"user_tz":-60,"elapsed":46900,"user":{"displayName":"Marek 
Leszczynski","photoUrl":"","userId":"05664549655810509768"}},"outputId":"1e00af5d-ec1c-4e7d-ecac-fc28f5855bb5"},"source":["!nvidia-smi"],"execution_count":6,"outputs":[{"output_type":"stream","text":["Fri Mar 19 09:41:44 2021 \n","+-----------------------------------------------------------------------------+\n","| NVIDIA-SMI 460.56 Driver Version: 460.32.03 CUDA Version: 11.2 |\n","|-------------------------------+----------------------+----------------------+\n","| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n","| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n","| | | MIG M. |\n","|===============================+======================+======================|\n","| 0 Tesla K80 Off | 00000000:00:04.0 Off | 0 |\n","| N/A 74C P8 33W / 149W | 3MiB / 11441MiB | 0% Default |\n","| | | N/A |\n","+-------------------------------+----------------------+----------------------+\n"," \n","+-----------------------------------------------------------------------------+\n","| Processes: |\n","| GPU GI CI PID Type Process name GPU Memory |\n","| ID ID Usage |\n","|=============================================================================|\n","| No running processes found |\n","+-----------------------------------------------------------------------------+\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"pZIeiQm9Cya8"},"source":["Load standard snipet to prevent random disconnects\n","This cell runs JS code to automatic reconnect to runtime."]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":35},"id":"T-N9y4S6C1I5","executionInfo":{"status":"ok","timestamp":1616146901728,"user_tz":-60,"elapsed":46896,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}},"outputId":"c9ee6dee-4748-4dde-8f9a-2bfea3f6471e"},"source":["import IPython\n","from google.colab import output\n","\n","display(IPython.display.Javascript('''\n"," function ClickConnect(){\n"," btn 
= document.querySelector(\"colab-connect-button\")\n"," if (btn != null){\n"," console.log(\"Click colab-connect-button\"); \n"," btn.click() \n"," }\n"," \n"," btn = document.getElementById('ok')\n"," if (btn != null){\n"," console.log(\"Click reconnect\"); \n"," btn.click() \n"," }\n"," }\n"," \n","setInterval(ClickConnect,60000)\n","'''))\n","\n","print(\"Done.\")"],"execution_count":7,"outputs":[{"output_type":"display_data","data":{"application/javascript":["\n"," function ClickConnect(){\n"," btn = document.querySelector(\"colab-connect-button\")\n"," if (btn != null){\n"," console.log(\"Click colab-connect-button\"); \n"," btn.click() \n"," }\n"," \n"," btn = document.getElementById('ok')\n"," if (btn != null){\n"," console.log(\"Click reconnect\"); \n"," btn.click() \n"," }\n"," }\n"," \n","setInterval(ClickConnect,60000)\n"],"text/plain":[""]},"metadata":{"tags":[]}},{"output_type":"stream","text":["Done.\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"o2qnh-a79N6H","executionInfo":{"status":"ok","timestamp":1616146901729,"user_tz":-60,"elapsed":46893,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}},"outputId":"a8290a41-8a22-43cf-ae87-e30f727e693c"},"source":["# Get config of fastai2 paths\n","config = Config()\n","config.d"],"execution_count":8,"outputs":[{"output_type":"execute_result","data":{"text/plain":["{'archive_path': '/root/.fastai/archive',\n"," 'data_path': '/root/.fastai/data',\n"," 'model_path': '/root/.fastai/models',\n"," 'storage_path': '/tmp',\n"," 'version': 2}"]},"metadata":{"tags":[]},"execution_count":8}]},{"cell_type":"markdown","metadata":{"nbpresent":{"id":"cf070ab7-babb-4cf0-a315-401f65461dc8"},"id":"5pL4tfG49N6I"},"source":["This will create a `{lang}wiki` folder, containing a `{lang}wiki` text file with the wikipedia contents (for other languages, replace `{lang}` with the appropriate code from the [list of 
wikipedias](https://meta.wikimedia.org/wiki/List_of_Wikipedias))."]},{"cell_type":"code","metadata":{"id":"2ShRUXWj_NoG","executionInfo":{"status":"ok","timestamp":1616146901729,"user_tz":-60,"elapsed":46889,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}}},"source":["# setup new path_data and create the corresponding folder\n","lang = 'pl'\n","name = f'{lang}wiki'\n","data_path = config['data_path']\n","path_data = data_path/name\n","path_data.mkdir(exist_ok=True, parents=True)"],"execution_count":9,"outputs":[]},{"cell_type":"code","metadata":{"id":"NykPqucXA0hF","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1616146901730,"user_tz":-60,"elapsed":46886,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}},"outputId":"7720a172-7887-4f9a-b16c-91d0b5c6f7b2"},"source":["cd /content/gdrive/MyDrive/fastai"],"execution_count":10,"outputs":[{"output_type":"stream","text":["/content/gdrive/MyDrive/fastai\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"9YmkrjvBDPPr","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1616146901731,"user_tz":-60,"elapsed":46883,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}},"outputId":"3882c85c-3884-4bb3-e02a-2c7333a919bd"},"source":["data_path, path_data"],"execution_count":11,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(Path('/root/.fastai/data'), Path('/root/.fastai/data/plwiki'))"]},"metadata":{"tags":[]},"execution_count":11}]},{"cell_type":"code","metadata":{"id":"egNzIEWtylqk","executionInfo":{"status":"ok","timestamp":1616146901731,"user_tz":-60,"elapsed":46878,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}}},"source":[],"execution_count":11,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"NBx8vCFkBtDu"},"source":["# 3. 
Loading the previously scraped Wikipedia dump (~1 GB) for the chosen language\n","The dump was prepared beforehand with a separate notebook: [wiki download](https://github.com/len-sla/other/blob/main/wiki_download.ipynb)"]},{"cell_type":"code","metadata":{"id":"pyZnd8Srze_Z","executionInfo":{"status":"ok","timestamp":1616146942655,"user_tz":-60,"elapsed":81985,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}}},"source":["!cp /content/gdrive/MyDrive/fastai/all_texts_plwiki.csv /root/.fastai/data/plwiki\n","!cp /content/gdrive/MyDrive/fastai/all_texts_plwiki.txt /root/.fastai/data/plwiki"],"execution_count":12,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"N-2ccH-IAuUC","executionInfo":{"status":"ok","timestamp":1616146942657,"user_tz":-60,"elapsed":81981,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}},"outputId":"db143030-0b3b-41b4-e13e-c03a08271b4c"},"source":["!du -hs {'/content/gdrive/MyDrive/fastai/all_texts_plwiki.csv'}"],"execution_count":13,"outputs":[{"output_type":"stream","text":["1.1G\t/content/gdrive/MyDrive/fastai/all_texts_plwiki.csv\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":0},"id":"fdlfLPzP_3Z0","executionInfo":{"status":"ok","timestamp":1616146957043,"user_tz":-60,"elapsed":96361,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}},"outputId":"b57cb4d5-62f8-4d16-c5d7-52e1466f8622"},"source":["df = pd.read_csv('/content/gdrive/MyDrive/fastai/all_texts_plwiki.csv')\n","df.head()"],"execution_count":14,"outputs":[{"output_type":"execute_result","data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
text
0Henry Wager Halleck (ur. 16 stycznia 1815, zm. 9 stycznia 1872) – amerykański wojskowy, naukowiec i prawnik, oficer United States Army.\\n\\n, znany pod – obraźliwym później – przydomkiem „Old Brains”, brał czynny udział w dziele przyłączenia Kalifornii jako stanu. Z powodzeniem praktykował jako prawnik i deweloper. Na początku wojny secesyjnej, był naczelnym dowódcą Armii Unii na zachodnim teatrze działań, a jednocześnie – przez prawie dwa lata – głównodowodzącym wszystkich armii USA. „Awansował” na szefa sztabu armii, gdy generał-porucznik Ulysses Grant, były podkomendny Hallecka na zachod...
1Kościół Najświętszej Marii Panny (\"in summo\") w Poznaniu – zabytkowy gotycki kościół na Ostrowie Tumskim wraz z resztkami wczesnopiastowskiego palatium.\\n\\nW dzisiejszym kształcie powstał w połowie XV wieku, jednak jego historia rozpoczyna się około 965 roku, gdy po przybyciu Dobrawy wzniesiono na Ostrowie Tumskim kaplicę zamkową. W dokumentach kościół Najświętszej Marii Panny pod swoim dzisiejszym wezwaniem pojawia się po raz pierwszy w 1247. \\n\\nWedług najnowszych badań prawdopodobnie pod prezbiterium znajdują się fundamenty rotundy pełniącej funkcję kaplicy, pewnym jest natomiast istnie...
2Gieorgij Andriejewicz Mołczanow (ros. Георгий Андреевич Молчанов, ur. 3 kwietnia 1897 w Charkowie, zm. 9 października 1937 w miejscu egzekucji Kommunarka) – funkcjonariusz radzieckiej policji politycznej, komisarz bezpieczeństwa państwowego II rangi, ludowy komisarz spraw wewnętrznych Białoruskiej SRR (1936-1937).\\n\\nUrodzony w rodzinie rosyjskiej. Do 1917 uczył się w szkole handlowej w Charkowie, od listopada 1917 do czerwca 1918 był żołnierzem i członkiem sztabu Głównodowodzącego Wojsk Południa Rosji Antonowa-Owsiejenki, później pracował w sztabie Frontu Wschodniego. \\n\\nOd grudnia 1917 ...
3José Manuel Durão Barroso (wym. []; ur. 23 marca 1956 w Lizbonie) – portugalski polityk, prawnik i nauczyciel akademicki. W latach 1992–1995 minister spraw zagranicznych w rządzie Aníbal Cavaco Silvy, od 1999 do 2004 przewodniczący Partii Socjaldemokratycznej. Premier Portugalii od 6 kwietnia 2002 do 17 lipca 2004. Od 22 listopada 2004 do 31 października 2014 przewodniczący Komisji Europejskiej.\\n\\nUkończył prawo na Uniwersytecie Lizbońskim, a także studia europejskie na Uniwersytecie Genewskim, na którym uzyskał również magisterium w zakresie nauk politycznych. Pracował jako nauczyciel ak...
4Laodika I (gr. \"Λαοδίκη\", \"Laodíkē\") (zm. po 242 p.n.e.) – córka Achajosa Starszego z dynastii Seleucydów, brata Antiocha I Sotera, pierwsza żona brata stryjecznego Antiocha II Theosa, króla państwa Seleucydów, syna Antiocha I Sotera.\\n\\nW czasie II wojny syryjskiej (258-248 p.n.e.) jej mąż Antioch II Theos, jako sprzymierzeniec Macedonii walczył przeciwko Egiptowi. W wyniku tej wojny Antioch II zawarł porozumienie z królem Egiptu Ptolemeuszem II Filadelfem w r. 250 p.n.e. Miał się wyprzeć żony Laodiki I i wspólnych z nią dzieci, a poślubić jego córkę Berenikę oraz zdeklarować się uczynić ...
\n","
"],"text/plain":[" text\n","0 Henry Wager Halleck (ur. 16 stycznia 1815, zm. 9 stycznia 1872) – amerykański wojskowy, naukowiec i prawnik, oficer United States Army.\\n\\n, znany pod – obraźliwym później – przydomkiem „Old Brains”, brał czynny udział w dziele przyłączenia Kalifornii jako stanu. Z powodzeniem praktykował jako prawnik i deweloper. Na początku wojny secesyjnej, był naczelnym dowódcą Armii Unii na zachodnim teatrze działań, a jednocześnie – przez prawie dwa lata – głównodowodzącym wszystkich armii USA. „Awansował” na szefa sztabu armii, gdy generał-porucznik Ulysses Grant, były podkomendny Hallecka na zachod...\n","1 Kościół Najświętszej Marii Panny (\"in summo\") w Poznaniu – zabytkowy gotycki kościół na Ostrowie Tumskim wraz z resztkami wczesnopiastowskiego palatium.\\n\\nW dzisiejszym kształcie powstał w połowie XV wieku, jednak jego historia rozpoczyna się około 965 roku, gdy po przybyciu Dobrawy wzniesiono na Ostrowie Tumskim kaplicę zamkową. W dokumentach kościół Najświętszej Marii Panny pod swoim dzisiejszym wezwaniem pojawia się po raz pierwszy w 1247. \\n\\nWedług najnowszych badań prawdopodobnie pod prezbiterium znajdują się fundamenty rotundy pełniącej funkcję kaplicy, pewnym jest natomiast istnie...\n","2 Gieorgij Andriejewicz Mołczanow (ros. Георгий Андреевич Молчанов, ur. 3 kwietnia 1897 w Charkowie, zm. 9 października 1937 w miejscu egzekucji Kommunarka) – funkcjonariusz radzieckiej policji politycznej, komisarz bezpieczeństwa państwowego II rangi, ludowy komisarz spraw wewnętrznych Białoruskiej SRR (1936-1937).\\n\\nUrodzony w rodzinie rosyjskiej. Do 1917 uczył się w szkole handlowej w Charkowie, od listopada 1917 do czerwca 1918 był żołnierzem i członkiem sztabu Głównodowodzącego Wojsk Południa Rosji Antonowa-Owsiejenki, później pracował w sztabie Frontu Wschodniego. \\n\\nOd grudnia 1917 ...\n","3 José Manuel Durão Barroso (wym. []; ur. 23 marca 1956 w Lizbonie) – portugalski polityk, prawnik i nauczyciel akademicki. 
W latach 1992–1995 minister spraw zagranicznych w rządzie Aníbal Cavaco Silvy, od 1999 do 2004 przewodniczący Partii Socjaldemokratycznej. Premier Portugalii od 6 kwietnia 2002 do 17 lipca 2004. Od 22 listopada 2004 do 31 października 2014 przewodniczący Komisji Europejskiej.\\n\\nUkończył prawo na Uniwersytecie Lizbońskim, a także studia europejskie na Uniwersytecie Genewskim, na którym uzyskał również magisterium w zakresie nauk politycznych. Pracował jako nauczyciel ak...\n","4 Laodika I (gr. \"Λαοδίκη\", \"Laodíkē\") (zm. po 242 p.n.e.) – córka Achajosa Starszego z dynastii Seleucydów, brata Antiocha I Sotera, pierwsza żona brata stryjecznego Antiocha II Theosa, króla państwa Seleucydów, syna Antiocha I Sotera.\\n\\nW czasie II wojny syryjskiej (258-248 p.n.e.) jej mąż Antioch II Theos, jako sprzymierzeniec Macedonii walczył przeciwko Egiptowi. W wyniku tej wojny Antioch II zawarł porozumienie z królem Egiptu Ptolemeuszem II Filadelfem w r. 250 p.n.e. Miał się wyprzeć żony Laodiki I i wspólnych z nią dzieci, a poślubić jego córkę Berenikę oraz zdeklarować się uczynić ..."]},"metadata":{"tags":[]},"execution_count":14}]},{"cell_type":"markdown","metadata":{"id":"Zad8TrPNybqT"},"source":["# 4. 
copying ready polish tokenizer"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"S90iGQ1i1Dhi","executionInfo":{"status":"ok","timestamp":1616146961177,"user_tz":-60,"elapsed":95166,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}},"outputId":"21bbac37-29a3-4a79-eacc-2332c164f59a"},"source":["%%time\n","!pip install transformers\n","!pip freeze | grep transformers"],"execution_count":15,"outputs":[{"output_type":"stream","text":["Requirement already satisfied: transformers in /usr/local/lib/python3.7/dist-packages (4.4.2)\n","Requirement already satisfied: sacremoses in /usr/local/lib/python3.7/dist-packages (from transformers) (0.0.43)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.0.12)\n","Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from transformers) (3.7.2)\n","Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n","Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.19.5)\n","Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers) (20.9)\n","Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n","Requirement already satisfied: tokenizers<0.11,>=0.10.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.10.1)\n","Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.41.1)\n","Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n","Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.15.0)\n","Requirement already satisfied: joblib 
in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.0.1)\n","Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers) (3.4.1)\n","Requirement already satisfied: typing-extensions>=3.6.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers) (3.7.4.3)\n","Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n","Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2020.12.5)\n","Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n","Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers) (2.4.7)\n","transformers==4.4.2\n","CPU times: user 14.3 ms, sys: 119 ms, total: 133 ms\n","Wall time: 4.05 s\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"x8q1Ck6J9N6W","executionInfo":{"status":"ok","timestamp":1616147005674,"user_tz":-60,"elapsed":4400,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}},"outputId":"f4c0fcfd-e2a7-4a36-9125-40800cb8972a"},"source":["%%time\n","from transformers import GPT2TokenizerFast\n","\n","pretrained_weights = 'gpt2'\n","tokenizer_en = GPT2TokenizerFast.from_pretrained(pretrained_weights)"],"execution_count":27,"outputs":[{"output_type":"stream","text":["CPU times: user 136 ms, sys: 9.13 ms, total: 146 ms\n","Wall time: 3.41 
s\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"25v40IRy01Ye","executionInfo":{"status":"ok","timestamp":1616147005678,"user_tz":-60,"elapsed":3431,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}}},"source":["# To correct the warning about token_pad (GPT2TokenizerFast), run the following code\n","# source: https://github.com/huggingface/transformers/issues/2648#issuecomment-616177044\n","tokenizer_en.pad_token = tokenizer_en.eos_token"],"execution_count":28,"outputs":[]},{"cell_type":"code","metadata":{"id":"IErVmjWa01Vl","executionInfo":{"status":"ok","timestamp":1616146965178,"user_tz":-60,"elapsed":99143,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}}},"source":[],"execution_count":17,"outputs":[]},{"cell_type":"code","metadata":{"scrolled":true,"colab":{"base_uri":"https://localhost:8080/"},"id":"LSV4oDRI9N6W","executionInfo":{"status":"ok","timestamp":1616147005681,"user_tz":-60,"elapsed":2407,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}},"outputId":"4266ac07-7811-432d-8bb1-c8ab618a83fa"},"source":["# source: https://huggingface.co/transformers/_modules/transformers/tokenization_utils_fast.html\n","\n","print('---------- vocab ----------')\n","print()\n","\n","print('vocab_files_names:',tokenizer_en.vocab_files_names)\n","print()\n","\n","for k,v in tokenizer_en.pretrained_vocab_files_map.items():\n"," print(k)\n"," for kk,vv in v.items():\n"," print('- ',kk,':',vv)\n"," print()\n"," \n","print('vocab_size:',tokenizer_en.vocab_size)\n","print()\n","#print(tokenizer_en.get_vocab())\n","\n","num = 50\n","print(f'First {num} items of the vocab: {dict(itertools.islice(tokenizer_en.get_vocab().items(), 20))}')"],"execution_count":29,"outputs":[{"output_type":"stream","text":["---------- vocab ----------\n","\n","vocab_files_names: {'vocab_file': 'vocab.json', 'merges_file': 'merges.txt', 'tokenizer_file': 
'tokenizer.json'}\n","\n","vocab_file\n","- gpt2 : https://huggingface.co/gpt2/resolve/main/vocab.json\n","- gpt2-medium : https://huggingface.co/gpt2-medium/resolve/main/vocab.json\n","- gpt2-large : https://huggingface.co/gpt2-large/resolve/main/vocab.json\n","- gpt2-xl : https://huggingface.co/gpt2-xl/resolve/main/vocab.json\n","- distilgpt2 : https://huggingface.co/distilgpt2/resolve/main/vocab.json\n","\n","merges_file\n","- gpt2 : https://huggingface.co/gpt2/resolve/main/merges.txt\n","- gpt2-medium : https://huggingface.co/gpt2-medium/resolve/main/merges.txt\n","- gpt2-large : https://huggingface.co/gpt2-large/resolve/main/merges.txt\n","- gpt2-xl : https://huggingface.co/gpt2-xl/resolve/main/merges.txt\n","- distilgpt2 : https://huggingface.co/distilgpt2/resolve/main/merges.txt\n","\n","tokenizer_file\n","- gpt2 : https://huggingface.co/gpt2/resolve/main/tokenizer.json\n","- gpt2-medium : https://huggingface.co/gpt2-medium/resolve/main/tokenizer.json\n","- gpt2-large : https://huggingface.co/gpt2-large/resolve/main/tokenizer.json\n","- gpt2-xl : https://huggingface.co/gpt2-xl/resolve/main/tokenizer.json\n","- distilgpt2 : https://huggingface.co/distilgpt2/resolve/main/tokenizer.json\n","\n","vocab_size: 50257\n","\n","First 50 items of the vocab: {'Ġhive': 35881, 'ãĥ': 1209, 'ĠKe': 3873, 'Ġpolitical': 1964, 'Ġconclusions': 13242, 'Ġdeputies': 21861, 'Bern': 23927, 'ĠGG': 37442, 'quished': 39737, 'especially': 16480, 'Ġ122': 19409, 'ceans': 19961, 'ishment': 17862, 'Ġ2007': 4343, '????????': 35709, 'Kansas': 43451, 'Ġrepentance': 45893, 'Ġtexted': 47358, 'chery': 31132, 'ĠInspector': 24625}\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"vspJiJsr01P8","executionInfo":{"status":"ok","timestamp":1616147013470,"user_tz":-60,"elapsed":4810,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}},"outputId":"88c58dfc-8284-4775-d914-ccbf250ceed3"},"source":["!pip install 
# Create the directory that holds the Polish ByteLevelBPE tokenizer files
ByteLevelBPE_tokenizer_pl_rep = 'ByteLevelBPE_tokenizer_pl'
path_to_ByteLevelBPE_tokenizer_pl_rep = path_data/ByteLevelBPE_tokenizer_pl_rep
# exist_ok=True already makes mkdir a no-op for an existing directory,
# so no separate exists() check is needed
path_to_ByteLevelBPE_tokenizer_pl_rep.mkdir(exist_ok=True, parents=True)
# ByteLevelBPE_tokenizer_pl.save_model(str(path_to_ByteLevelBPE_tokenizer_pl_rep))
from tokenizers.implementations import ByteLevelBPETokenizer

# Load the previously created Polish byte-level BPE tokenizer from its vocab.json and
# merges.txt files. The file paths are built from `path_to_ByteLevelBPE_tokenizer_pl_rep`
# instead of repeating the absolute directory.
vocab_file = path_to_ByteLevelBPE_tokenizer_pl_rep/'vocab.json'
merges_file = path_to_ByteLevelBPE_tokenizer_pl_rep/'merges.txt'
for tokenizer_file in (vocab_file, merges_file):
    # Fail early with a clear message if the files were not copied into place
    if not tokenizer_file.exists():
        raise FileNotFoundError(f'Missing tokenizer file: {tokenizer_file}')

ByteLevelBPE_tokenizer_pl = ByteLevelBPETokenizer(
    str(vocab_file),
    str(merges_file),
)
# Round-trip a short Polish sentence through the tokenizer as a sanity check
text = "Taki mały tekst dla sprawdzenia ."
output = ByteLevelBPE_tokenizer_pl.encode(text)
print('\n splitting by tokens\n ')
print(output.ids,)
print(output.tokens)
print(output.offsets)

# Decode the ids obtained above instead of encoding the same text a second time
back_to_text = ByteLevelBPE_tokenizer_pl.decode(output.ids)

print('\ninput text:', text)
print('tokens ids:', output.ids)
print('back to text:', back_to_text)

# Make the check explicit: decoding must give back the original sentence
assert back_to_text == text, f'round-trip mismatch: {back_to_text!r} != {text!r}'
Create a fastai tokenizer and update the embeddings matrix of the GPT-2 English pre-trained model"]},{"cell_type":"markdown","metadata":{"id":"2Fmobv0m9NYd"},"source":["Now let's see how we can use fastai v2 to fine-tune this model on Wikipedia in Polish, using all the fastai v2 training utilities.\n","\n","We will follow these 2 steps:"]},{"cell_type":"markdown","metadata":{"id":"mD9vUIko9NYd"},"source":["- 4.1) **GPT2TokenizerFast (imported GPT-2 tokenizer) --> fastai Tokenizer**: to process the data to train a model, we need to build a fastai tokenizer from the GPT-2 tokenizer with vocab in Polish.\n","- 4.2) **Change vocab embeddings (wte matrix) in the GPT-2 pre-trained model to adapt to the Polish vocab**: as the vocab embedding matrix (wte) of the pre-trained GPT-2 model corresponds to the English vocabulary, we'll keep the embedding vectors of the common tokens between the English and Polish vocab."]},{"cell_type":"markdown","metadata":{"id":"hHrKpeRo9NYe"},"source":[" First, we import all the text utilities:"]},{"cell_type":"code","metadata":{"id":"63BrGAAX9NYe","executionInfo":{"status":"ok","timestamp":1616147047523,"user_tz":-60,"elapsed":1011,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}}},"source":["from fastai.text.all import *"],"execution_count":38,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"dhfoTxlr9NYh"},"source":["#### 4.1 GPT2TokenizerFast (imported GPT-2 tokenizer) --> fastai Tokenizer"]},{"cell_type":"markdown","metadata":{"id":"wj-Y5lue9NYh"},"source":["*(text from Sylvain Gugger Transformers Tutorial)* To process this data to train a model, we need to build a `Transform` that will be applied lazily. 
class TransformersTokenizer(Transform):
    """fastai `Transform` that wraps a Hugging Face tokenizer.

    Encoding turns a text into a tensor of token ids; decoding turns a tensor
    of token ids back into a `TitledStr`.
    """

    def __init__(self, tokenizer):
        # Keep the wrapped tokenizer available as `self.tokenizer`
        self.tokenizer = tokenizer

    def encodes(self, x):
        """Tokenize `x` and return the corresponding token ids as a tensor."""
        token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(x))
        return tensor(token_ids)

    def decodes(self, x):
        """Decode the token ids in tensor `x` and return the text as a `TitledStr`."""
        # The ids are moved to the CPU and converted to a NumPy array before decoding
        ids = x.cpu().numpy()
        return TitledStr(self.tokenizer.decode(ids))
# Directory that holds the Polish ByteLevelBPE tokenizer config files
ByteLevelBPE_tokenizer_pl_rep = 'ByteLevelBPE_tokenizer_pl'
path_to_ByteLevelBPE_tokenizer_pl_rep = path_data.joinpath(ByteLevelBPE_tokenizer_pl_rep)

# Build a GPT2TokenizerFast from the Polish tokenizer files in that directory,
# using '<|endoftext|>' as the padding token
tokenizer_pl = GPT2TokenizerFast.from_pretrained(
    str(path_to_ByteLevelBPE_tokenizer_pl_rep), pad_token='<|endoftext|>'
)

# Set the maximum sequence length to 1024
tokenizer_pl.model_max_length = 1024
racji.\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"j5BWhClY-OiS"},"source":["tokenizer_fastai_pl"]},{"cell_type":"code","metadata":{"id":"XK_GJzdN9NYu","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1616147101037,"user_tz":-60,"elapsed":874,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}},"outputId":"a544245a-ef48-431b-e455-f6b1ca5f6cbe"},"source":["# Test of the class TransformersTokenizer of fastai with tokenizer_pl\n","tokenizer_fastai_pl = TransformersTokenizer(tokenizer_pl)\n","text = \"Maybe, you're right\"\n","tokens_ids = tokenizer_fastai_pl.encodes(text)\n","tokens = tokenizer_fastai_pl.tokenizer.convert_ids_to_tokens(tokens_ids)\n","\n","print('input text:',TitledStr(text))\n","print('text tokens:',TitledStr(tokens))\n","print('text tokens_ids:',TitledStr(tokens_ids))\n","print('output text:',TitledStr(tokenizer_fastai_pl.decodes(tokens_ids)))"],"execution_count":44,"outputs":[{"output_type":"stream","text":["input text: Maybe, you're right\n","text tokens: ['Ma', 'y', 'be', ',', 'Ġyou', \"'\", 're', 'Ġri', 'ght']\n","text tokens_ids: tensor([ 2945, 89, 1355, 12, 37025, 7, 299, 23035, 3767])\n","output text: Maybe, you're right\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"jwYVg7fG9NYx"},"source":["#### 4.2 Change vocab embeddings (wte matrix) in the GPT-2 pre-trained model to adapt to the Polish vocab"]},{"cell_type":"code","metadata":{"id":"X_L95YjK9NY_","executionInfo":{"status":"ok","timestamp":1616144728967,"user_tz":-60,"elapsed":8349,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}}},"source":["# import model if needed\n","from transformers import GPT2LMHeadModel\n","pretrained_weights = 'gpt2'\n","model_en = GPT2LMHeadModel.from_pretrained(pretrained_weights)"],"execution_count":33,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"a5pmnF3d9NZD"},"source":["##### Check vocabs 
# Wrap both tokenizers for fastai, then compare the sizes of their vocabularies
tokenizer_fastai_en = TransformersTokenizer(tokenizer_en)
tokenizer_fastai_pl = TransformersTokenizer(tokenizer_pl)

old_vocab_size = tokenizer_fastai_en.tokenizer.vocab_size
new_vocab_size = tokenizer_fastai_pl.tokenizer.vocab_size

size_report = 'old_vocab_size--> {} ,new_vocab_size -->{} diffrence -->{}'
print(size_report.format(old_vocab_size, new_vocab_size, old_vocab_size - new_vocab_size))
# Show the current wte and lm_head weights and check whether they are element-wise equal
tens_a = model_en.transformer.wte.weight
tens_b = model_en.lm_head.weight
weights_match = torch.all(tens_a.eq(tens_b))
tens_a, tens_b, weights_match
# Build the new wte: a token present in both vocabs keeps its English embedding vector,
# a token present only in the new vocab gets the mean embedding vector of the old wte.
old_vocab = tokenizer_fastai_en.tokenizer.get_vocab()
new_vocab = tokenizer_fastai_pl.tokenizer.get_vocab()
same_tokens_list = list()
different_tokens_list = list()

for w,idx_new in new_vocab.items():
    # -1 marks a token that does not exist in the old (English) vocab
    idx_old = old_vocab.get(w, -1)
    if idx_old>=0:
        new_wgts[idx_new] = old_wgts[idx_old]
        same_tokens_list.append((w,idx_new))
    else:
        new_wgts[idx_new] = wgts_m
        different_tokens_list.append((w,idx_new))

# Install the new wte in the model
new_wte = nn.Embedding(new_vocab_size,old_wgts.size(1))
new_wte.weight.data = new_wgts
model_en.transformer.set_input_embeddings(new_wte)
# set_input_embeddings is called on the inner transformer only, so lm_head is not touched
# by it. Point lm_head at the new wte weights so both keep using the same matrix.
model_en.lm_head.weight = model_en.transformer.wte.weight
print(f'Polish wte matrix setup done!\n\nWe kept {len(same_tokens_list)} embeddings vectors from the English one.\nWe did not kept {len(different_tokens_list)} embeddings vectors from the English one (instead, we used the old wte mean vector).\n')

# Show a sample of the tokens that are / are not shared by the 2 vocabs
num = 15
print(f'{num} first tokens IN common between the 2 vocabs:\n{same_tokens_list[:num]}\n')
print(f'{num} first tokens NOT in common between the 2 vocabs:\n{different_tokens_list[:num]}')

# Save new_wgts
torch.save(new_wgts, path_data/'new_wte_wgts.pl')
# Save same_tokens_list and different_tokens_list
torch.save(same_tokens_list, path_data/'same_tokens_list.pl')
torch.save(different_tokens_list, path_data/'different_tokens_list.pl')
2 vocabs:\n","[('ĠJud', 22904), ('ĠSab', 42367), ('ĠAnge', 5618), ('1', 17), ('ĠTin', 38533), ('ĠCook', 40773), ('ĠOne', 12435), ('Ġsale', 19760), ('ĠRun', 28577), ('én', 20218), ('ras', 7778), ('ĠEth', 40490), ('ĠEk', 4341), ('arn', 43204), ('ĠFin', 5592)]\n","\n","15 first tokens NOT in common between the 2 vocabs:\n","[('udio', 16969), ('ĠgaÅĤÄħ', 19241), ('ĠCechÄħ', 39503), ('ĠOlgi', 48836), ('ĠTrzebi', 25840), ('szyÄĩ', 7428), ('Ġewangelickiej', 36500), ('Ġpriorytet', 41018), ('ĠBrooklynie', 49683), ('ĠÅļwiatowej', 17951), ('ĠuczestniczÄħ', 24890), ('Ġkursów', 21245), ('ĠBost', 14634), ('zachodniego', 41008), ('ĠZiemiÄħ', 48258)]\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"_Ne5z695_ZTF","executionInfo":{"status":"ok","timestamp":1616144752872,"user_tz":-60,"elapsed":850,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}},"outputId":"e0f9e43b-0dd3-487f-96cb-762c8747d278"},"source":["ls -all '/root/.fastai/data/plwiki'"],"execution_count":41,"outputs":[{"output_type":"stream","text":["total 2300124\n","drwxr-xr-x 3 root root 4096 Mar 19 09:05 \u001b[0m\u001b[01;34m.\u001b[0m/\n","drwxr-xr-x 3 root root 4096 Mar 19 08:57 \u001b[01;34m..\u001b[0m/\n","-rw------- 1 root root 1101183658 Mar 19 08:57 all_texts_plwiki.csv\n","-rw------- 1 root root 1098323868 Mar 19 08:58 all_texts_plwiki.txt\n","drwxr-xr-x 2 root root 4096 Mar 19 08:59 \u001b[01;34mByteLevelBPE_tokenizer_pl\u001b[0m/\n","-rw-r--r-- 1 root root 1216559 Mar 19 09:05 different_tokens_list.pl\n","-rw-r--r-- 1 root root 154390264 Mar 19 09:05 new_wte_wgts.pl\n","-rw-r--r-- 1 root root 182831 Mar 19 09:05 same_tokens_list.pl\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"Atp10nyc_ZQs","executionInfo":{"status":"ok","timestamp":1616147121409,"user_tz":-60,"elapsed":2130,"user":{"displayName":"Marek Leszczynski","photoUrl":"","userId":"05664549655810509768"}}},"source":["!cp 
/root/.fastai/data/plwiki/new_wte_wgts.pl /content/gdrive/MyDrive/fastai\n","!cp /root/.fastai/data/plwiki/different_tokens_list.pl /content/gdrive/MyDrive/fastai\n","!cp /root/.fastai/data/plwiki/same_tokens_list.pl /content/gdrive/MyDrive/fastai"],"execution_count":45,"outputs":[]},{"cell_type":"code","metadata":{"id":"qV4l1zID_ZNr"},"source":[],"execution_count":null,"outputs":[]}]} \ No newline at end of file diff --git a/wiki_download.ipynb b/wiki_download.ipynb new file mode 100644 index 0000000..12eee5c --- /dev/null +++ b/wiki_download.ipynb @@ -0,0 +1 @@ +{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"01_wiki_download.ipynb","provenance":[{"file_id":"1qOb3NkwpJE3pzxMGk1qgnvQsc63AEvsy","timestamp":1615886943370}],"collapsed_sections":[],"toc_visible":true,"authorship_tag":"ABX9TyOQgEqtyrdaHDu6p3QewScT"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"markdown","metadata":{"id":"Cg6_0Te6XQfI"},"source":["# Downloading wiki to csv file"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"CZGLl0TuXPSC","executionInfo":{"status":"ok","timestamp":1615887007555,"user_tz":-60,"elapsed":23917,"user":{"displayName":"Mark Lina","photoUrl":"","userId":"17651129667533642938"}},"outputId":"9ff4fbf5-fa87-4c5b-af78-9481ad477893"},"source":["#start by mounting google drive\r\n","from google.colab import drive, files\r\n","drive.mount('/content/gdrive', force_remount=True)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Mounted at /content/gdrive\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"Vj-2UbtHY47a"},"source":["# 1. 
Installing required libraries"]},{"cell_type":"code","metadata":{"id":"xiPsWe_2XbBX","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1615887384482,"user_tz":-60,"elapsed":169538,"user":{"displayName":"Mark Lina","photoUrl":"","userId":"17651129667533642938"}},"outputId":"be26d1e9-4c09-4e75-dd26-2b0387565699"},"source":["# need to instal fastai 2 etc before \r\n","!pip install -q git+https://github.com/fastai/fastai\r\n","!pip install -q git+https://github.com/fastai/fastcore\r\n","!pip install -q iterative-stratification"],"execution_count":null,"outputs":[{"output_type":"stream","text":["\u001b[K |████████████████████████████████| 61kB 4.3MB/s \n","\u001b[K |████████████████████████████████| 12.8MB 9.8MB/s \n","\u001b[K |████████████████████████████████| 776.8MB 23kB/s \n","\u001b[?25h Building wheel for fastai (setup.py) ... \u001b[?25l\u001b[?25hdone\n","\u001b[31mERROR: torchtext 0.9.0 has requirement torch==1.8.0, but you'll have torch 1.7.1 which is incompatible.\u001b[0m\n"," Building wheel for fastcore (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":338},"id":"nULo_1TuZTVM","executionInfo":{"status":"error","timestamp":1615887946761,"user_tz":-60,"elapsed":588,"user":{"displayName":"Mark Lina","photoUrl":"","userId":"17651129667533642938"}},"outputId":"e7872676-7495-4f78-d01d-5edbb6dfef92"},"source":["from fastai.text.all import *\r\n","# from nlputils_fastai import *"],"execution_count":null,"outputs":[{"output_type":"error","ename":"ModuleNotFoundError","evalue":"ignored","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)","\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mfastai\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mall\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mnlputils_fastai\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m","\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'nlputils_fastai'","","\u001b[0;31m---------------------------------------------------------------------------\u001b[0;32m\nNOTE: If your import is failing due to a missing package, you can\nmanually install dependencies using either !pip or !apt.\n\nTo view examples of installing some common dependencies, click the\n\"Open Examples\" button below.\n\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n"]}]},{"cell_type":"markdown","metadata":{"id":"w7wM6U0WYA-S"},"source":["Ready libraries are inside the fastai package. 
and ...\r\n","We also need to import the old library /content/gdrive/MyDrive/fastai/nlputilsfastai.py for the wiki extractor /content/gdrive/MyDrive/fastai/WikiExtractor.py\r\n","\r\n","\r\n","```\r\n","from fastai.basics import * # was fastai2\r\n","```\r\n"]},{"cell_type":"markdown","metadata":{"id":"kQl0bIwkc3rP"},"source":["# 2. Checking config"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"qendlFDhY0jd","executionInfo":{"status":"ok","timestamp":1615887454607,"user_tz":-60,"elapsed":717,"user":{"displayName":"Mark Lina","photoUrl":"","userId":"17651129667533642938"}},"outputId":"c64a5796-9983-4d48-868d-08fcfb9f75f9"},"source":["# Get config of paths\r\n","config = Config()\r\n","config.d"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["{'archive_path': '/root/.fastai/archive',\n"," 'data_path': '/root/.fastai/data',\n"," 'model_path': '/root/.fastai/models',\n"," 'storage_path': '/tmp',\n"," 'version': 2}"]},"metadata":{"tags":[]},"execution_count":8}]},{"source":["This will create a {lang}wiki folder, containing a {lang}wiki text file with the wikipedia contents (for other languages, replace {lang} with the appropriate code from the list of wikipedias)."],"cell_type":"markdown","metadata":{}},{"cell_type":"code","metadata":{"id":"yRIGbO2VZFXP"},"source":["# setup new path_data and create the corresponding folder\r\n","lang = 'pl'\r\n","name = f'{lang}wiki'\r\n","data_path = config['data_path']\r\n","path_data = data_path/name\r\n","path_data.mkdir(exist_ok=True, parents=True)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"vi96awP0Z9dz","executionInfo":{"status":"ok","timestamp":1615887589987,"user_tz":-60,"elapsed":695,"user":{"displayName":"Mark Lina","photoUrl":"","userId":"17651129667533642938"}},"outputId":"890407b9-04a9-45e7-db9f-b66f6496e6d0"},"source":["data_path, 
path_data\r\n"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(Path('/root/.fastai/data'), Path('/root/.fastai/data/plwiki'))"]},"metadata":{"tags":[]},"execution_count":13}]},{"cell_type":"code","metadata":{"id":"hR1DYxhUa8sM"},"source":["#copying old library and extractor to fastai directory\r\n","!cp /content/gdrive/MyDrive/fastai/nlputilsfastai.py /root/.fastai/data/plwiki\r\n","!cp /content/gdrive/MyDrive/fastai/WikiExtractor.py /root/.fastai/data/plwiki/wikiextractor/"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"DSWBUty9bzg5","executionInfo":{"status":"ok","timestamp":1615889886286,"user_tz":-60,"elapsed":625,"user":{"displayName":"Mark Lina","photoUrl":"","userId":"17651129667533642938"}},"outputId":"bd4ba25a-47f6-4c0e-f71a-0b6658cd8ffc"},"source":["ls -sll /root/.fastai/data/plwiki"],"execution_count":null,"outputs":[{"output_type":"stream","text":["total 11318064\n"," 8 -rw------- 1 root root 4538 Mar 16 10:11 nlputilsfastai.py\n","9337464 -rw-r--r-- 1 root root 9561555601 Mar 16 10:07 plwiki-latest-pages-articles.xml\n","1980460 -rw-r--r-- 1 root root 2027983294 Mar 16 09:57 plwiki-latest-pages-articles.xml.bz2\n"," 4 drwxr-xr-x 2 root root 4096 Mar 16 09:50 \u001b[0m\u001b[01;34m__pycache__\u001b[0m/\n"," 4 drwxr-xr-x 5 root root 4096 Mar 16 10:07 \u001b[01;34mwikiextractor\u001b[0m/\n"," 124 -rw------- 1 root root 123111 Mar 16 10:11 WikiExtractor.py\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"p2kIHhuycHlU"},"source":["!cd /root/.fastai/data/plwiki"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"z65_h_sXcTdm"},"source":["from nlputilsfastai import * #this time it should be OK"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"gseX4EVPm681"},"source":["# 3. 
Download PL Wikipedia\r\n"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"gYJtgrPmacQl","executionInfo":{"status":"ok","timestamp":1615891580232,"user_tz":-60,"elapsed":1495804,"user":{"displayName":"Mark Lina","photoUrl":"","userId":"17651129667533642938"}},"outputId":"38e879ab-b4f7-409d-b83d-792db658ce6f"},"source":["get_wiki(path_data,lang)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["extracting...\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"11ha3vxf6S9b"},"source":[],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"FzJeD35PhZSW","executionInfo":{"status":"ok","timestamp":1615891671670,"user_tz":-60,"elapsed":846,"user":{"displayName":"Mark Lina","photoUrl":"","userId":"17651129667533642938"}},"outputId":"d790b4ef-503b-4bb5-bccc-6e0c879f3cee"},"source":["ls -sll /root/.fastai/data/plwiki/wikiextractor/"],"execution_count":null,"outputs":[{"output_type":"stream","text":["total 180\n"," 4 -rwxr-xr-x 1 root root 799 Mar 16 10:07 \u001b[0m\u001b[01;32mextract.sh\u001b[0m*\n"," 36 -rw-r--r-- 1 root root 34523 Mar 16 10:07 LICENSE\n"," 4 drwxr-xr-x 2 root root 4096 Mar 16 10:21 \u001b[01;34m__pycache__\u001b[0m/\n"," 8 -rw-r--r-- 1 root root 6517 Mar 16 10:07 README.md\n"," 4 -rw-r--r-- 1 root root 1321 Mar 16 10:07 setup.py\n"," 4 drwxr-xr-x 2 root root 4096 Mar 16 10:07 \u001b[01;34mwikiextractor\u001b[0m/\n","120 -rw------- 1 root root 119800 Mar 16 10:21 WikiExtractor.py\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"BoEimYTNdk3d"},"source":["If `get_wiki(path_data,lang)` breaks, fix the download manually in a terminal:\r\n","/root/.fastai/data/plwiki\r\n","\r\n","- mkdir -p /root/.fastai/data/plwiki\r\n","- cd /root/.fastai/data/plwiki\r\n","- wget -c https://dumps.wikimedia.org/plwiki/latest/plwiki-latest-pages-articles.xml.bz2\r\n","- bzip2 -dk 
plwiki-latest-pages-articles.xml.bz2\r\n","\r\n","And re-run `get_wiki(path_data,lang)` once the download is successful."]},{"cell_type":"markdown","metadata":{"id":"UwFDe467feQd"},"source":["Path for original notebook \r\n","https://github.com/piegu/fastai-projects/blob/master/finetuning-English-GPT2-any-language-Portuguese-HuggingFace-fastaiv2_FAST.ipynb"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"C016KuXUpsfx","executionInfo":{"status":"ok","timestamp":1615891715301,"user_tz":-60,"elapsed":705,"user":{"displayName":"Mark Lina","photoUrl":"","userId":"17651129667533642938"}},"outputId":"7b867d72-7c84-4291-d648-96f4df02a47b"},"source":["!head -n4 {path_data}/{name}"],"execution_count":null,"outputs":[{"output_type":"stream","text":["\n","AWK\n","\n","AWK – interpretowany język programowania, którego główną funkcją jest wyszukiwanie i przetwarzanie wzorców w plikach lub strumieniach danych. Jest także nazwą programu początkowo dostępnego dla systemów operacyjnych będących pochodnymi UNIX-a, obecnie także na inne platformy.\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"m33O8VJqptHF","executionInfo":{"status":"ok","timestamp":1615891771452,"user_tz":-60,"elapsed":34156,"user":{"displayName":"Mark Lina","photoUrl":"","userId":"17651129667533642938"}},"outputId":"04d188a6-654a-4e9d-ed55-b71c1e00335b"},"source":["dest = 
split_wiki(path_data,lang)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["0\n","100000\n","200000\n","300000\n","400000\n","500000\n","600000\n","700000\n","800000\n","900000\n","1000000\n","1100000\n","1200000\n","1300000\n","1400000\n","1500000\n","1600000\n","1700000\n","1800000\n","1900000\n","2000000\n","2100000\n","2200000\n","2300000\n","2400000\n","2500000\n","2600000\n","2700000\n","2800000\n","2900000\n","3000000\n","3100000\n","3200000\n","3300000\n","3400000\n","3500000\n","3600000\n","3700000\n","3800000\n","3900000\n","4000000\n","4100000\n","4200000\n","4300000\n","4400000\n","4500000\n","4600000\n","4700000\n","4800000\n","4900000\n","5000000\n","5100000\n","5200000\n","5300000\n","5400000\n","5500000\n","5600000\n","5700000\n","5800000\n","5900000\n","6000000\n","6100000\n","6200000\n","6300000\n","6400000\n","6500000\n","6600000\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"G-wZeaDpp2-P","executionInfo":{"status":"ok","timestamp":1615892782534,"user_tz":-60,"elapsed":2697,"user":{"displayName":"Mark Lina","photoUrl":"","userId":"17651129667533642938"}},"outputId":"b7fe4889-e3d3-4c8e-fffc-6fd5c6ba15d7"},"source":["# articles are being stored as many txt files( here 229610) in doc folder\r\n","dest = path_data/'docs'\r\n","for file in dest.ls()[:15]:\r\n"," print(file)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["/root/.fastai/data/plwiki/docs/Vanquish (gra komputerowa).txt\n","/root/.fastai/data/plwiki/docs/Książka obrazkowa.txt\n","/root/.fastai/data/plwiki/docs/Gilles Simon.txt\n","/root/.fastai/data/plwiki/docs/Wsiewołod Meyerhold.txt\n","/root/.fastai/data/plwiki/docs/Alexandre-Édouard Kierzkowski.txt\n","/root/.fastai/data/plwiki/docs/Legenda: Wojna Smoków.txt\n","/root/.fastai/data/plwiki/docs/Kościół św. 
Mikołaja w Papowie Biskupim.txt\n","/root/.fastai/data/plwiki/docs/Bad Dürkheim.txt\n","/root/.fastai/data/plwiki/docs/Claude Vivier.txt\n","/root/.fastai/data/plwiki/docs/Loricariinae.txt\n","/root/.fastai/data/plwiki/docs/Linia tramwajowa Halle-Ammendorf–Bad Dürrenberg.txt\n","/root/.fastai/data/plwiki/docs/Walter Nemitz.txt\n","/root/.fastai/data/plwiki/docs/Iwan Rusak.txt\n","/root/.fastai/data/plwiki/docs/Olga Lepieszynska (biolog).txt\n","/root/.fastai/data/plwiki/docs/Paul Balzereit.txt\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"4Qkw08LvqDUs","executionInfo":{"status":"ok","timestamp":1615891878334,"user_tz":-60,"elapsed":30791,"user":{"displayName":"Mark Lina","photoUrl":"","userId":"17651129667533642938"}},"outputId":"67d50340-07c3-4a22-d616-0db1e1c6629f"},"source":["%%time\r\n","# Size of downloaded data in the docs folder\r\n","num_files, num_tokens = get_num_tokens(dest)\r\n","print(f'{num_files} files - {num_tokens} tokens')"],"execution_count":null,"outputs":[{"output_type":"stream","text":["229610 files - 144194849 tokens\n","CPU times: user 21.1 s, sys: 5.09 s, total: 26.2 s\n","Wall time: 27.5 s\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Fz1EmUPmqtVk","executionInfo":{"status":"ok","timestamp":1615892047470,"user_tz":-60,"elapsed":66983,"user":{"displayName":"Mark 
Lina","photoUrl":"","userId":"17651129667533642938"}},"outputId":"0602ed93-b932-49cb-f189-9f5e8e76cecc"},"source":["%%time\r\n","get_one_clean_file(dest,lang)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["0\n","1000\n","2000\n","3000\n","4000\n","5000\n","6000\n","7000\n","8000\n","9000\n","10000\n","11000\n","12000\n","13000\n","14000\n","15000\n","16000\n","17000\n","18000\n","19000\n","20000\n","21000\n","22000\n","23000\n","24000\n","25000\n","26000\n","27000\n","28000\n","29000\n","30000\n","31000\n","32000\n","33000\n","34000\n","35000\n","36000\n","37000\n","38000\n","39000\n","40000\n","41000\n","42000\n","43000\n","44000\n","45000\n","46000\n","47000\n","48000\n","49000\n","50000\n","51000\n","52000\n","53000\n","54000\n","55000\n","56000\n","57000\n","58000\n","59000\n","60000\n","61000\n","62000\n","63000\n","64000\n","65000\n","66000\n","67000\n","68000\n","69000\n","70000\n","71000\n","72000\n","73000\n","74000\n","75000\n","76000\n","77000\n","78000\n","79000\n","80000\n","81000\n","82000\n","83000\n","84000\n","85000\n","86000\n","87000\n","88000\n","89000\n","90000\n","91000\n","92000\n","93000\n","94000\n","95000\n","96000\n","97000\n","98000\n","99000\n","100000\n","101000\n","102000\n","103000\n","104000\n","105000\n","106000\n","107000\n","108000\n","109000\n","110000\n","111000\n","112000\n","113000\n","114000\n","115000\n","116000\n","117000\n","118000\n","119000\n","120000\n","121000\n","122000\n","123000\n","124000\n","125000\n","126000\n","127000\n","128000\n","129000\n","130000\n","131000\n","132000\n","133000\n","134000\n","135000\n","136000\n","137000\n","138000\n","139000\n","140000\n","141000\n","142000\n","143000\n","144000\n","145000\n","146000\n","147000\n","148000\n","149000\n","150000\n","151000\n","152000\n","153000\n","154000\n","155000\n","156000\n","157000\n","158000\n","159000\n","160000\n","161000\n","162000\n","163000\n","164000\n","165000\n","166000\n","167000\n","168000\n","169000\n","170000\n","1710
00\n","172000\n","173000\n","174000\n","175000\n","176000\n","177000\n","178000\n","179000\n","180000\n","181000\n","182000\n","183000\n","184000\n","185000\n","186000\n","187000\n","188000\n","189000\n","190000\n","191000\n","192000\n","193000\n","194000\n","195000\n","196000\n","197000\n","198000\n","199000\n","200000\n","201000\n","202000\n","203000\n","204000\n","205000\n","206000\n","207000\n","208000\n","209000\n","210000\n","211000\n","212000\n","213000\n","214000\n","215000\n","216000\n","217000\n","218000\n","219000\n","220000\n","221000\n","222000\n","223000\n","224000\n","225000\n","226000\n","227000\n","228000\n","229000\n","all texts from wikipedia pl in the file /root/.fastai/data/plwiki/all_texts_plwiki.txt\n","\n","CPU times: user 41.1 s, sys: 13.2 s, total: 54.3 s\n","Wall time: 1min 6s\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"XhjmUCc3rbcR","executionInfo":{"status":"ok","timestamp":1615892266053,"user_tz":-60,"elapsed":96622,"user":{"displayName":"Mark Lina","photoUrl":"","userId":"17651129667533642938"}},"outputId":"447cae75-d4da-44e7-a4a1-ceca911cbe98"},"source":["%%time\r\n","get_one_clean_csv_file(dest,lang)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["all texts from wikipedia pl in the file /root/.fastai/data/plwiki/all_texts_plwiki.csv\n","\n","CPU times: user 1min 2s, sys: 8.58 s, total: 1min 11s\n","Wall time: 1min 35s\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"lQbc9BGDrGGL","executionInfo":{"status":"ok","timestamp":1615892270787,"user_tz":-60,"elapsed":1000,"user":{"displayName":"Mark Lina","photoUrl":"","userId":"17651129667533642938"}},"outputId":"ce244c44-f49b-4c44-e809-f9acf08d0c7f"},"source":["ls -sll /root/.fastai/data/plwiki"],"execution_count":null,"outputs":[{"output_type":"stream","text":["total 14578952\n","1075380 -rw-r--r-- 1 root root 1101183658 Mar 16 10:57 
all_texts_plwiki.csv\n","1072588 -rw-r--r-- 1 root root 1098323868 Mar 16 10:54 all_texts_plwiki.txt\n"," 11108 drwxr-xr-x 2 root root 11370496 Mar 16 10:49 \u001b[0m\u001b[01;34mdocs\u001b[0m/\n"," 0 -rw-r--r-- 1 root root 0 Mar 16 10:21 log\n"," 8 -rw------- 1 root root 4538 Mar 16 10:21 nlputilsfastai.py\n","1101812 -rw-r--r-- 1 root root 1128249870 Mar 16 10:46 plwiki\n","9337464 -rw-r--r-- 1 root root 9561555601 Mar 16 10:07 plwiki-latest-pages-articles.xml\n","1980460 -rw-r--r-- 1 root root 2027983294 Mar 16 09:57 plwiki-latest-pages-articles.xml.bz2\n"," 4 drwxr-xr-x 2 root root 4096 Mar 16 09:50 \u001b[01;34m__pycache__\u001b[0m/\n"," 4 drwxr-xr-x 6 root root 4096 Mar 16 10:21 \u001b[01;34mwikiextractor\u001b[0m/\n"," 124 -rw------- 1 root root 123111 Mar 16 10:11 WikiExtractor.py\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"Ix7Of2bVr5XP"},"source":["!cp /root/.fastai/data/plwiki/all_texts_plwiki.csv /content/gdrive/MyDrive/fastai\r\n","!cp /root/.fastai/data/plwiki/all_texts_plwiki.txt /content/gdrive/MyDrive/fastai"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"yzX4fM94ul-7"},"source":["# Saving to an HDF5 file for later use, if needed"]},{"cell_type":"code","metadata":{"id":"gajmqk9luCPp"},"source":["df = pd.read_csv('/content/gdrive/MyDrive/fastai/all_texts_plwiki.csv')\r\n","df.to_hdf('/content/gdrive/MyDrive/fastai/pl-wiki.h5', key='df', mode='w')"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"W8baYgzpubte"},"source":[],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"UxniNZlcqC--"},"source":["Now that we have the text to feed into GPT-2, let's move on..."]}]} \ No newline at end of file