diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 6d7b8171..a4f0bc6e 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -8,16 +8,9 @@ WORKDIR /app ENV POETRY_HOME=/opt/poetry ENV POETRY_VENV=/opt/poetry-venv ENV POETRY_CACHE_DIR=/opt/.cache -ENV DOTNET_ROLL_FORWARD=LatestMajor ENV PIP_DISABLE_PIP_VERSION_CHECK=on ENV TZ=America/New_York RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone -# Install .NET SDK -RUN apt-get update -RUN apt-get install --no-install-recommends -y wget -RUN wget https://packages.microsoft.com/config/ubuntu/20.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb && \ - dpkg -i packages-microsoft-prod.deb && \ - rm packages-microsoft-prod.deb # Install apt packages RUN apt-get update RUN apt-get upgrade -y @@ -29,8 +22,7 @@ RUN apt-get install --no-install-recommends -y \ build-essential \ gdb \ curl \ - unzip \ - dotnet-sdk-7.0 + unzip # Make some useful symlinks that are expected to exist RUN ln -sfn /usr/bin/python${PYTHON_VERSION} /usr/bin/python3 & \ ln -sfn /usr/bin/python${PYTHON_VERSION} /usr/bin/python @@ -38,14 +30,18 @@ RUN ln -sfn /usr/bin/python${PYTHON_VERSION} /usr/bin/python3 & \ RUN python3 -m venv $POETRY_VENV \ && $POETRY_VENV/bin/pip install -U pip setuptools \ && $POETRY_VENV/bin/pip install poetry==${POETRY_VERSION} -# Add `poetry` to PATH +# Add `poetry` to PATH and configure ENV PATH="${PATH}:${POETRY_VENV}/bin" -# Install AWS CLI -RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \ - unzip awscliv2.zip && \ - ./aws/install && \ - rm awscliv2.zip -RUN rm -rf /var/lib/apt/lists/* RUN poetry config virtualenvs.create true && \ poetry config virtualenvs.in-project true +# Clean up +RUN rm -rf /var/lib/apt/lists/* +# Create caches +RUN mkdir -p /root/.cache/silnlp/experiments +RUN mkdir /root/.cache/silnlp/projects +ENV SIL_NLP_CACHE_EXPERIMENT_DIR=/root/.cache/silnlp/experiments +ENV 
SIL_NLP_CACHE_PROJECT_DIR=/root/.cache/silnlp/projects +# Set environment variables +ENV CLEARML_API_HOST="https://api.sil.hosted.allegro.ai" +ENV SIL_NLP_DATA_PATH=/aqua-ml-data CMD ["bash"] \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index b860b5fa..5d63b6cd 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -14,7 +14,7 @@ "-v", "${env:HOME}/.aws:/root/.aws", // Mount user's AWS credentials into the container "-v", - "/home/clearml/.clearml/hf-cache:/root/.cache/huggingface" + "${env:HOME}/clearml/.clearml/hf-cache:/root/.cache/huggingface" ], "containerEnv": { "AWS_REGION": "${localEnv:AWS_REGION}", @@ -44,7 +44,10 @@ }, "editor.formatOnSave": true, "editor.formatOnType": true, - "isort.args":["--profile", "black"] + "isort.args": [ + "--profile", + "black" + ] }, // Add the IDs of extensions you want installed when the container is created. "extensions": [ diff --git a/Dockerfile b/Dockerfile index 4a1c3fcb..387b76e8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -55,14 +55,6 @@ RUN apt-get install --no-install-recommends -y \ RUN ln -sfn /usr/bin/python${PYTHON_VERSION} /usr/bin/python3 & \ ln -sfn /usr/bin/python${PYTHON_VERSION} /usr/bin/python -# Install .NET SDK -RUN wget https://packages.microsoft.com/config/ubuntu/20.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb && \ - dpkg -i packages-microsoft-prod.deb && \ - rm packages-microsoft-prod.deb -RUN apt-get update && \ - apt-get install --no-install-recommends -y dotnet-sdk-7.0 -ENV DOTNET_ROLL_FORWARD=LatestMajor - # Install dependencies from poetry COPY --from=builder /src/requirements.txt . 
RUN pip install --no-cache-dir -r requirements.txt && rm requirements.txt @@ -105,11 +97,11 @@ RUN mv meteor-1.5/meteor-1.5.jar /usr/local/bin RUN rm -rf meteor-1.5 ENV METEOR_PATH=/usr/local/bin -# Other environment variables -ENV SIL_NLP_DATA_PATH=/aqua-ml-data -RUN mkdir -p .cache/silnlp -ENV SIL_NLP_CACHE_EXPERIMENT_DIR=/root/.cache/silnlp -ENV CLEARML_API_HOST="https://api.sil.hosted.allegro.ai" +# Create caches +RUN mkdir -p .cache/silnlp/experiments +RUN mkdir .cache/silnlp/projects +ENV SIL_NLP_CACHE_EXPERIMENT_DIR=/root/.cache/silnlp/experiments +ENV SIL_NLP_CACHE_PROJECT_DIR=/root/.cache/silnlp/projects # Clone silnlp and make it the starting directory RUN git clone https://github.com/sillsdev/silnlp.git diff --git a/README.md b/README.md index 6c728f05..75131936 100644 --- a/README.md +++ b/README.md @@ -10,16 +10,13 @@ SIL NLP provides a set of pipelines for performing experiments on various NLP ta --- ## SILNLP Prerequisites -These are the main requirements for the SILNLP code to run on a local machine. Using PyCharm is another way to configure the environment and instructions for that method are included later. -The SILNLP repo itself is hosted on Github, mainly written in Python and calls SIL.Machine.Tool. 'Machine' as we tend to call it, is a .NET application that has many functions for manipulating USFM data. Most of the language data we have for low resource languages in USFM format. Since Machine is a .Net application it depends upon the __.NET core SDK__ which works on Windows and Linux. Since there are many python packages that need to be used, with complex versioning requirements we use a Python package called Poetry to mangage all of those. So here is a rough heirarchy of SILNLP with the major dependencies. +These are the main requirements for the SILNLP code to run on a local machine. Since there are many Python packages that need to be used with complex versioning requirements, we use a Python package called Poetry to manage all of those. 
So here is a rough hierarchy of SILNLP with the major dependencies. | Requirement | Reason | | --------------------- | ----------------------------------------------------------------- | | GIT | to get the repo from [github](https://github.com/sillsdev/silnlp) | | Python | to run the silnlp code | | Poetry | to manage all the Python packages and versions | -| SIL.Machine.Tool | to support many functions for data manipulation | -| .Net core SDK | Required by SIL.Machine.Tool | | NVIDIA GPU | Required to run on a local machine | | Nvidia drivers | Required for the GPU | | CUDA Toolkit | Required for the Machine learning with the GPU | @@ -27,7 +24,6 @@ The SILNLP repo itself is hosted on Github, mainly written in Python and calls S ## Environment Setup -### Option 1: Docker Container 1. If using a local GPU, install the corresponding [NVIDIA driver](https://www.nvidia.com/download/index.aspx). On Ubuntu, the driver can alternatively be installed through the GUI by opening Software & Updates, navigating to Additional Drivers in the top menu, and selecting the newest NVIDIA driver with the labels proprietary and tested. @@ -35,8 +31,10 @@ The SILNLP repo itself is hosted on Github, mainly written in Python and calls S After installing the driver, reboot your system. 2. Download and install [Docker Desktop](https://www.docker.com/get-started/). + * If using Linux (not WSL), add your user to the docker group by using a terminal to run: `sudo usermod -aG docker $USER` * Reboot after installing, confirm that all installation steps are complete before the next step. -4. Pull Docker image + +3. Pull Docker image In a terminal, run: ``` @@ -45,7 +43,7 @@ The SILNLP repo itself is hosted on Github, mainly written in Python and calls S * For Windows, use CMD Prompt * If there is an error like "request returned Internal Server Error for API route and version , check if the server supports the requested API version" Check that the Docker Desktop installation steps are complete. 
Reopen CMD prompt and try again. -5. Create Docker container based on the image +4. Create Docker container based on the image If you're using a local GPU, then in a terminal, run: ``` @@ -57,227 +55,96 @@ The SILNLP repo itself is hosted on Github, mainly written in Python and calls S ``` A docker container should be created. You should be able to see a container named 'silnlp' on the Containers page of Docker Desktop. -6. Create file for environment variables - - __If you do not intend to use SILNLP with ClearML and AWS, you can skip this step.__ +5. Create file for environment variables - Create a text file with the following content and insert your credentials. + Create a text file with the following content and edit as necessary: ``` + CLEARML_API_HOST="https://api.sil.hosted.allegro.ai" CLEARML_API_ACCESS_KEY=xxxxx CLEARML_API_SECRET_KEY=xxxxx + AWS_REGION="us-east-1" AWS_ACCESS_KEY_ID=xxxxx AWS_SECRET_ACCESS_KEY=xxxxx + SIL_NLP_DATA_PATH="/aqua-ml-data" ``` + * If you do not intend to use SILNLP with ClearML and/or AWS, you can leave out the respective variables. If you need to generate ClearML credentials, see [ClearML setup](clear_ml_setup.md). * Note that this does not give you direct access to an AWS S3 bucket from within the Docker container, it only allows you to run scripts referencing files in the bucket. -7. Start container +6. Start container - If you completed step 5: \ In a terminal, run: ``` docker start silnlp docker exec -it --env-file path/to/env_vars_file silnlp bash ``` - If you did not complete step 5: \ - In a terminal, run: - ``` - docker start silnlp - docker exec -it silnlp bash - ``` + * After this step, the terminal should change to say `root@xxxxx:~/silnlp#`, where `xxxxx` is a string of letters and numbers, instead of your current working directory. This is the command line for the docker container, and you're able to run SILNLP scripts from here. * To leave the container, run `exit`, and to stop it, run `docker stop silnlp`. 
It can be started again by repeating step 6. Stopping the container will not erase any changes made in the container environment, but removing it will. -### Option 2: Manual Installation - -The SILNLP code can be run on either Windows or Linux operating systems. If using an Ubuntu distribution, the only compatible version is 20.04. +## Development Environment Setup -__Download and install__ the following before creating any projects or starting any code, preferably in this order to avoid most warnings: +Follow the instructions below to set up a Dev Container in VS Code. This is the recommended way to develop in SILNLP. For manual setup, see [Manual Setup](manual_setup.md). -1. If using a local GPU: [NVIDIA driver](https://www.nvidia.com/download/index.aspx) +1. If using a local GPU, install the corresponding [NVIDIA driver](https://www.nvidia.com/download/index.aspx). * On Ubuntu, the driver can alternatively be installed through the GUI by opening Software & Updates, navigating to Additional Drivers in the top menu, and selecting the newest NVIDIA driver with the labels proprietary and tested. * After installing the driver, reboot your system. -2. [Git](https://git-scm.com/downloads) -3. [Python 3.7](https://www.python.org/downloads/) (latest minor version, ie 3.7.9) - * Will also work with Python 3.8, but not Python 3.9 because of a [llvmlite incompatability](https://stackoverflow.com/questions/65798319/llvmlite-failed-to-install-error-building-llvmlite) - * Can alternatively install Python using [miniconda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/windows.html) if you're planning to use more than one version of Python. If following this method, activate your conda environment before installing Poetry. -4. [Poetry](https://python-poetry.org/docs/#installation) - * Note that whether the command should call python or python3 depends on which is required on your machine. 
- * It may (or may not) be possible to run the curl command within a VSCode terminal. If that causes permission errors close VS Code and try it in an elevated CMD prompt. - - - Windows: - - At an administrator CMD prompt or a terminal within VSCode run: - ``` - curl -sSL https://install.python-poetry.org | python - --version 1.2.2 - ``` - - - - In Powershell, run: - ``` - (Invoke-WebRequest -Uri https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py -UseBasicParsing).Content | python - ``` +2. Download and install [Docker Desktop](https://www.docker.com/get-started/). + * Reboot after installing and completing the relevant steps below, confirm that all installation steps are complete before the next step. + * Linux users should have no additional steps given that they follow Docker's distribution-specific setup. + Windows (non-WSL) and macOS: + * Open Settings in Docker Desktop and under the Resources tab, update File Sharing with any locations your source code is kept. + + WSL: + * Enable WSL 2 backend: + * Open Settings in Docker Desktop and check "Use WSL 2 based engine" under the General tab. It may already be checked. + * To verify, check under the Resources tab in Settings for a message saying that you are using the WSL 2 backend. Linux: + * Add your user to the docker group by using a terminal to run: `sudo usermod -aG docker $USER` + * Sign out and back in again so your changes take effect - In terminal, run: - ``` - curl -sSL https://install.python-poetry.org | python3 - - ``` - Add the following line to your .bashrc file in your home directory: - ``` - export PATH="$HOME/.local/bin:$PATH" - ``` - +3. Set up [ClearML](clear_ml_setup.md). -5. .NET Core SDK - * The necessary versions are 7.0 and 3.1. If your machine is only able to install version 7.0, you can set the DOTNET_ROLL_FORWARD environment variable to "LatestMajor", which will allow you to run anything that depends on dotnet 3.1. 
- * Note - the .NET SDK is needed for [SIL.Machine.Tool](https://github.com/sillsdev/machine). Many of the scripts in this repo require this .Net package. The .Net package will be installed and updated when the silnlp is initialized in `__init__.py`. - * Windows: [.NET Core SDK](https://dotnet.microsoft.com/download) - * Linux: Installation instructions can be found [here](https://learn.microsoft.com/en-us/dotnet/core/install/linux-ubuntu-2004) -6. C++ Redistributable - * Note - this may already be installed. If it is not installed you may get cryptic errors such as "System.DllNotFoundException: Unable to load DLL 'thot' or one of its dependencies" - * Windows: Download from https://support.microsoft.com/en-us/topic/the-latest-supported-visual-c-downloads-2647da03-1eea-4433-9aff-95f26a218cc0 and install - * Linux: Instead of installing the redistributable, run the following commands: +4. Define environment variables. + + Set the following environment variables with your respective credentials: CLEARML_API_ACCESS_KEY, CLEARML_API_SECRET_KEY, AWS_ACCESS_KEY_ID, and AWS_SECRET_ACCESS_KEY. Additionally, set AWS_REGION. The typical value is "us-east-1". + * Windows users: see [here](https://github.com/sillsdev/silnlp/wiki/Install-silnlp-on-Windows-10#permanently-set-environment-variables) for instructions on setting environment variables permanently + * Linux users: To set environment variables permanently, add each variable as a new line to the `.bashrc` file in your home directory with the format ``` - sudo apt-get update - sudo apt-get install build-essential gdb + export VAR="VAL" ``` ---- -## Development Environment setup -### Option 1: PyCharm Setup -If you wish, you can use [PyCharm 2020.1](https://www.jetbrains.com/pycharm/) as your Python IDE. -First, you will need to install the Poetry plugin for PyCharm. - -1. Go to `File -> Settings -> Plugins`. -2. Search for "Poetry" and install the plugin. 
- -Once the Poetry plugin is installed, you can clone the the repo using PyCharm. If you have already cloned the repo, you can open the folder in PyCharm and skip these steps. - -1. Go to `VCS -> Get from Version Control...`. -2. Enter `https://github.com/sillsdev/silnlp.git` in the URL field. -3. Click the `Clone`. -4. Enter your Github credentials if necessary. - -Next, you will need to setup the interpreter for the project. - -1. Go to `File -> Settings -> Project -> Project Interpreter`. -2. Click the gear button and select `Add...`. -3. Choose `Poetry Environment` and click `OK`. -4. PyCharm will setup the Poetry environment and install all dependencies. -5. Once PyCharm finishes the setup, click `OK`. - -You will need to configure PyCharm to work properly with the project. - -1. Go to `File -> Settings -> Editor -> Inspections`. -2. In the `Profile` dropdown, select `Project Default`. -3. Uncheck the `Python -> Package requirements` setting. -4. In the `Python -> PEP 8 coding style violation` setting, ignore the errors `E402` and `E203`. - -Lastly, setup PyCharm to use the Black code formatter by following the instructions [here](https://black.readthedocs.io/en/stable/editor_integration.html#pycharm-intellij-idea). - -### Option 2: Visual Studio Code setup -1. Install Visual Studio Code -2. Install Python extension for VSCode -3. Open up silnlp folder in VSC -4. In CMD window, type `poetry install` to create the virtual environment for silnlp - * If using conda, activate your conda environment first before `poetry install`. Poetry will then install all the dependencies into the conda environment. -5. Choose the newly created virtual environment as the "Python Interpreter" in the command palette (ctrl+shift+P) - * If using conda, choose the conda environment as the interpreter -6. Open the command palette and select "Preferences: Open User Settings (JSON)". 
In the `settings.json` file, add the following options: - ``` json - "python.formatting.provider": "black", - "python.linting.pylintEnabled": true, - "editor.formatOnSave": true, - ``` - -## S3 bucket setup -We use Amazon S3 storage for storing our experiment data. Here is some workspace setup to enable a decent workflow. - -### Install and configure AWS S3 storage -The following will allow the boto3 and S3Path libraries in Python correctly talk to the S3 bucket. -* Install the aws-cli from: https://aws.amazon.com/cli/ -* In cmd, type: `aws configure` and enter your AWS access_key_id and secret_access_key and the region (we use region = us-east-1). -* The aws configure command will create a folder in your home directory named '.aws' it should contain two plain text files named 'config' and 'credentials'. The config file should contain the region and the credentials file should contain your access_key_id and your secret_access_key. -(Home directory on windows is usually C:\Users\\ and on linux it is /home/username) - -### Install and configure rclone - - -**Windows** - -The following will mount /aqua-ml-data on your S drive and allow you to explore, read and write. -* Install WinFsp: http://www.secfs.net/winfsp/rel/ (Click the button to "Download WinFsp Installer" not the "SSHFS-Win (x64)" installer) -* Download rclone from: https://rclone.org/downloads/ -* Unzip to your desktop (or some convient location). -* Add the folder that contains rclone.exe to your PATH environment variable. -* Take the `scripts/rclone/rclone.conf` file from this SILNLP repo and copy it to `~\AppData\Roaming\rclone` (creating folders if necessary) -* Add your credentials in the appropriate fields in `~\AppData\Roaming\rclone` -* Take the `scripts/rclone/mount_to_s.bat` file from this SILNLP repo and copy it to the folder that contains the unzipped rclone. -* Double-click the bat file. A command window should open and remain open. 
You should see something like: -``` -C:\Users\David\Software\rclone>call rclone mount --vfs-cache-mode full --use-server-modtime s3aqua:aqua-ml-data S: -The service rclone has been started. -``` - -**Linux** - -The following will mount /aqua-ml-data to an S folder in your home directory and allow you to explore, read and write. -* Download rclone from: https://rclone.org/install/ -* Take the `scripts/rclone/rclone.conf` file from this SILNLP repo and copy it to `~/.config/rclone/rclone.conf` (creating folders if necessary) -* Add your credentials in the appropriate fields in `~/.config/rclone/rclone.conf` -* Create a folder called "S" in your user directory -* Run the following command: - ``` - rclone mount --vfs-cache-mode full --use-server-modtime s3aqua:aqua-ml-data ~/S - ``` -### To start S: drive on start up +5. Install Visual Studio Code. -**Windows** +6. Clone the silnlp repo. -Put a shortcut to the mount_to_s.bat file in the Startup folder. -* In Windows Explorer put `shell:startup` in the address bar or open `C:\Users\\AppData\Roaming\Microsoft\Windows\Start Menu\Programs\Startup` -* Right click to add a new shortcut. Choose `mount_to_s.bat` as the target, you can leave the name as the default. +7. Open up silnlp folder in VS Code. -Now your AWS S3 bucket should be mounted as S: drive when you start Windows. +8. Install the Dev Containers extension for VS Code. -**Linux** -* Run `crontab -e` -* Paste `@reboot rclone mount --vfs-cache-mode full --use-server-modtime s3aqua:aqua-ml-data ~/S` into the file, save and exit -* Reboot Linux +9. Build the dev container and open the silnlp folder in the container. + * Click on the Remote Indicator in the bottom left corner. + * Select "Reopen in Container" and choose the silnlp dev container if necessary. This will take a while the first time because the container has to build. + * If it was successful, the window will refresh and it will say "Dev Container: SILNLP" in the bottom left corner. 
+ * Note: If you don't have a local GPU, you may need to comment out the `gpus --all` part of the `runArgs` field of the `.devcontainer/devcontainer.json` file. -Now your AWS S3 bucket should be mounted as ~/S when you start Linux. +10. Install and activate Poetry environment. + * In the VS Code terminal, run `poetry install` to install the necessary Python libraries, and then run `poetry shell` to enter the environment in the terminal. +11. (Optional) Locally mount the S3 bucket. This will allow you to interact directly with the S3 bucket from your local terminal (outside of the dev container). See instructions [here](s3_bucket_setup.md). -### Setup environment variable -The following will cause the SILNLP tools to select the S3 bucket for local silnlp operations. If you are using the Docker container, these variables will already be set and the cache will be located at `/root/.cache/silnlp`. +To get back into the dev container and poetry environment each subsequent time, open the silnlp folder in VS Code, select the "Reopen in Container" option from the Remote Connection menu (bottom left corner), and use the `poetry shell` command in the terminal. 
-**Windows or Linux** -* Set the environment variable SIL_NLP_DATA_PATH to "/aqua-ml-data" -* Create the directory "/home/user/.cache/silnlp", replacing "user" with your username -* Set the environment variables SIL_NLP_CACHE_EXPERIMENT_DIR and SIL_NLP_CACHE_PROJECT_DIR to "/home/user/.cache/silnlp" - ---- - -## Setup ClearML on local PC -To use Clear ML for managing experiments see the [ClearML Setup](clear_ml_windows_setup.md) - -## Additional Information for Development Environments - -### Additional Environment Variables -Set the following environment variables with your respective credentials: CLEARML_API_ACCESS_KEY, CLEARML_API_SECRET_KEY, AWS_ACCESS_KEY_ID, and AWS_SECRET_ACCESS_KEY -* Windows users: see [here](https://github.com/sillsdev/silnlp/wiki/Install-silnlp-on-Windows-10#permanently-set-environment-variables) for instructions on setting environment variables permanently -* Linux users: To set environment variables permanently, add each variable as a new line to the `.bashrc` file in your home directory with the format - ``` - export VAR="VAL" - ``` - -### Setting Up and Running Experiments +## Setting Up and Running Experiments See the [wiki](https://github.com/sillsdev/silnlp/wiki) for information on setting up and running experiments. The most important pages for getting started are the ones on [file structure](https://github.com/sillsdev/silnlp/wiki/Folder-structure-and-file-naming-conventions), [model configuration](https://github.com/sillsdev/silnlp/wiki/Configure-a-model), and [running experiments](https://github.com/sillsdev/silnlp/wiki/NMT:-Usage). A lot of the instructions are specific to NMT, but are still helpful starting points for doing other things like [alignment](https://github.com/sillsdev/silnlp/wiki/Alignment:-Usage). -If you are using VS Code, see [this](https://github.com/sillsdev/silnlp/wiki/Using-the-Python-Debugger) page for information on using the debugger. 
+See [this](https://github.com/sillsdev/silnlp/wiki/Using-the-Python-Debugger) page for information on using the VS Code debugger. If you need to use a tool that is supported by SILNLP but is not installable as a Python library (which is probably the case if you get an error like "RuntimeError: eflomal is not installed."), follow the appropriate instructions [here](https://github.com/sillsdev/silnlp/wiki/Installing-External-Libraries). + +## .NET Machine alignment models + +If you need to run the .NET versions of the Machine alignment models, you will need to install .NET Core SDK 8.0. After installing, run `dotnet tool restore`. + * Windows: [.NET Core SDK](https://dotnet.microsoft.com/download) + * Linux: Installation instructions can be found [here](https://learn.microsoft.com/en-us/dotnet/core/install/linux-ubuntu-2004). \ No newline at end of file diff --git a/clear_ml_linux_setup.md b/clear_ml_setup.md similarity index 88% rename from clear_ml_linux_setup.md rename to clear_ml_setup.md index 07c9cadf..093094d2 100644 --- a/clear_ml_linux_setup.md +++ b/clear_ml_setup.md @@ -1,10 +1,7 @@ -# Instructions for setting up Clear-ML on Linux. - -These were tested on Pop!_OS. -See [Clear-ML Windows setup](clear_ml_windows_setup.md) for instructions to set up Clear-ML on Windows. +# Instructions for setting up Clear-ML. ## Install the clearml python package. -Open a terminal and use pip to install Clear-ML. +Open a terminal (or Command Prompt on Windows) and use pip to install Clear-ML. `pip install clearml` ## Add your AWS storage vault credentials (If using AWS S3). diff --git a/clear_ml_windows_setup.md b/clear_ml_windows_setup.md deleted file mode 100644 index d5c956a0..00000000 --- a/clear_ml_windows_setup.md +++ /dev/null @@ -1,46 +0,0 @@ -# Instructions for setting up Clear-ML on Windows. - -These were written and tested for use with Windows 10. -See [Clear-ML Linux setup](clear_ml_linux_setup.md) for instructions to set up Clear-ML on linux. 
- -## Install the clearml python package. -Open a command window and use pip to install Clear-ML. -`pip install clearml` - -## Add your AWS storage vault credentials (If using AWS S3). -1. Login to [Clear-ML](https://app.sil.hosted.allegro.ai) login with your work email address. -2. Go to the [workspace settings](https://app.sil.hosted.allegro.ai/settings/workspace-configuration). -3. At the top of the page you should see the configuration vault. If you don't see the configuration vault it is probably the case that you are not logged in to the Enterprise version of Clear-ML. - Add your aws key and secret and the region to the configuration vault using this format: -``` -sdk { - aws { - s3 { - key: "xxxxxxxxxxxxxxxxxx" - secret: "xxxxxxxxxxxxxxxxxx" - region: "us-east-1" - } - } -} -``` - -## Create your Clear-ML credentials -1. In a command window enter `clearml-init` you should be prompted to `Paste copied configuration here:` -2. On the [workspace settings](https://app.sil.hosted.allegro.ai/settings/workspace-configuration) webpage click `Create new credentials`. - -They'll look something like this: -``` -api { - web_server: https://app.sil.hosted.allegro.ai - api_server: https://api.sil.hosted.allegro.ai - files_server: https://files.sil.hosted.allegro.ai - credentials { - "access_key" = "xxxxxxxxxxxxxxxxxx" - "secret_key" = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" - } -} -``` -3. Use the button to copy the new credentials to your clipboard. -4. Paste them into the command window. -5. This will create a clearml.conf file in your home directory i.e. C:\Users\\clearml.conf -6. If this file already exists the `clearml-init` command will invite you to edit it. You may find it easier to delete it and run through these instructions, or you can put the copied details into the existing file in the required format. 
diff --git a/manual_setup.md b/manual_setup.md new file mode 100644 index 00000000..df7d03bc --- /dev/null +++ b/manual_setup.md @@ -0,0 +1,113 @@ +# Manual Setup + +## SILNLP Prerequisites +These are the main requirements for the SILNLP code to run on a local machine. Since there are many Python packages that need to be used with complex versioning requirements, we use a Python package called Poetry to manage all of those. So here is a rough hierarchy of SILNLP with the major dependencies. + +| Requirement | Reason | +| --------------------- | ----------------------------------------------------------------- | +| GIT | to get the repo from [github](https://github.com/sillsdev/silnlp) | +| Python | to run the silnlp code | +| Poetry | to manage all the Python packages and versions | +| NVIDIA GPU | Required to run on a local machine | +| Nvidia drivers | Required for the GPU | +| CUDA Toolkit | Required for the Machine learning with the GPU | +| Environment variables | To tell SILNLP where to find the data, etc. | + +## Setup + +The SILNLP code can be run on either Windows or Linux operating systems. If using an Ubuntu distribution, the only compatible version is 20.04. + +__Download and install__ the following before creating any projects or starting any code, preferably in this order to avoid most warnings: + +1. If using a local GPU: [NVIDIA driver](https://www.nvidia.com/download/index.aspx) + * On Ubuntu, the driver can alternatively be installed through the GUI by opening Software & Updates, navigating to Additional Drivers in the top menu, and selecting the newest NVIDIA driver with the labels proprietary and tested. + * After installing the driver, reboot your system. +2. [Git](https://git-scm.com/downloads) +3. 
[Python 3.8](https://www.python.org/downloads/) (latest patch version, e.g. 3.8.19) + * Can alternatively install Python using [miniconda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/windows.html) if you're planning to use more than one version of Python. If following this method, activate your conda environment before installing Poetry. +4. [Poetry](https://python-poetry.org/docs/#installation) + * Note that whether the command should call python or python3 depends on which is required on your machine. + * It may (or may not) be possible to run the curl command within a VS Code terminal. If that causes permission errors, close VS Code and try it in an elevated CMD prompt. + + Windows: + At an administrator CMD prompt or a terminal within VS Code run: + ``` + curl -sSL https://install.python-poetry.org | python - --version 1.7.1 + ``` + In PowerShell, run: + ``` + (Invoke-WebRequest -Uri https://install.python-poetry.org -UseBasicParsing).Content | python - --version 1.7.1 + ``` + + Linux: + In a terminal, run: + ``` + curl -sSL https://install.python-poetry.org | python3 - --version 1.7.1 + ``` + Add the following line to your .bashrc file in your home directory: + ``` + export PATH="$HOME/.local/bin:$PATH" + ``` +5. C++ Redistributable + * Note - this may already be installed. If it is not installed you may get cryptic errors such as "System.DllNotFoundException: Unable to load DLL 'thot' or one of its dependencies" + * Windows: Download from https://support.microsoft.com/en-us/topic/the-latest-supported-visual-c-downloads-2647da03-1eea-4433-9aff-95f26a218cc0 and install + * Linux: Instead of installing the redistributable, run the following commands: + ``` + sudo apt-get update + sudo apt-get install build-essential gdb + ``` + +### Visual Studio Code setup + +1. Install Visual Studio Code +2. Install Python extension for VS Code +3. Open up silnlp folder in VS Code +4. 
In CMD window, type `poetry install` to create the virtual environment for silnlp + * If using conda, activate your conda environment first before `poetry install`. Poetry will then install all the dependencies into the conda environment. +5. Choose the newly created virtual environment as the "Python Interpreter" in the command palette (ctrl+shift+P) + * If using conda, choose the conda environment as the interpreter +6. Open the command palette and select "Preferences: Open User Settings (JSON)". In the `settings.json` file, add the following options: + ``` json + "python.formatting.provider": "black", + "python.linting.pylintEnabled": true, + "editor.formatOnSave": true, + ``` + +### S3 bucket setup + +See [S3 bucket setup](s3_bucket_setup.md). + +### ClearML setup + +See [ClearML setup](clear_ml_setup.md). + +### Create SILNLP cache +* Create the directory "/home/user/.cache/silnlp", replacing "user" with your username. +* Create the directory "/home/user/.cache/silnlp/experiments" and set the environment variable SIL_NLP_CACHE_EXPERIMENT_DIR to that path. +* Create the directory "/home/user/.cache/silnlp/projects" and set the environment variable SIL_NLP_CACHE_PROJECT_DIR to that path. + +### Additional Environment Variables +* Set the following environment variables with your respective credentials: CLEARML_API_ACCESS_KEY, CLEARML_API_SECRET_KEY, AWS_ACCESS_KEY_ID, and AWS_SECRET_ACCESS_KEY. +* Set SIL_NLP_DATA_PATH to "/aqua-ml-data" and CLEARML_API_HOST to "https://api.sil.hosted.allegro.ai". + +### Setting Up and Running Experiments + +See the [wiki](https://github.com/sillsdev/silnlp/wiki) for information on setting up and running experiments. 
The most important pages for getting started are the ones on [file structure](https://github.com/sillsdev/silnlp/wiki/Folder-structure-and-file-naming-conventions), [model configuration](https://github.com/sillsdev/silnlp/wiki/Configure-a-model), and [running experiments](https://github.com/sillsdev/silnlp/wiki/NMT:-Usage). A lot of the instructions are specific to NMT, but are still helpful starting points for doing other things like [alignment](https://github.com/sillsdev/silnlp/wiki/Alignment:-Usage). + +See [this](https://github.com/sillsdev/silnlp/wiki/Using-the-Python-Debugger) page for information on using the VS Code debugger. + +If you need to use a tool that is supported by SILNLP but is not installable as a Python library (which is probably the case if you get an error like "RuntimeError: eflomal is not installed."), follow the appropriate instructions [here](https://github.com/sillsdev/silnlp/wiki/Installing-External-Libraries). + +## Setting environment variables permanently +Windows users: see [here](https://github.com/sillsdev/silnlp/wiki/Install-silnlp-on-Windows-10#permanently-set-environment-variables) for instructions on setting environment variables permanently. + +Linux users: To set environment variables permanently, add each variable as a new line to the `.bashrc` file in your home directory with the format + ``` + export VAR="VAL" + ``` + +## .NET Machine alignment models + +If you need to run the .NET versions of the Machine alignment models, you will need to install the .NET SDK 8.0. After installing, run `dotnet tool restore`. + * Windows: [.NET SDK](https://dotnet.microsoft.com/download) + * Linux: Installation instructions can be found [here](https://learn.microsoft.com/en-us/dotnet/core/install/linux-ubuntu-2004).
\ No newline at end of file diff --git a/s3_bucket_setup.md b/s3_bucket_setup.md new file mode 100644 index 00000000..0ef8dcf1 --- /dev/null +++ b/s3_bucket_setup.md @@ -0,0 +1,55 @@ +# S3 bucket setup + +We use Amazon S3 storage for storing our experiment data. Here is some workspace setup to enable a decent workflow. + +### Install and configure AWS S3 storage +* Install the aws-cli from: https://aws.amazon.com/cli/ +* In cmd, type: `aws configure` and enter your AWS access_key_id and secret_access_key and the region (we use region = us-east-1). +* The `aws configure` command will create a folder named '.aws' in your home directory. It should contain two plain text files named 'config' and 'credentials'. The config file should contain the region and the credentials file should contain your access_key_id and your secret_access_key. +(The home directory on Windows is usually `C:\Users\<username>\` and on Linux it is `/home/<username>`.) + +### Install and configure rclone + +**Windows** + +The following will mount /aqua-ml-data on your S drive and allow you to explore, read and write. +* Install WinFsp: http://www.secfs.net/winfsp/rel/ (Click the button to "Download WinFsp Installer", not the "SSHFS-Win (x64)" installer) +* Download rclone from: https://rclone.org/downloads/ +* Unzip to your desktop (or some convenient location). +* Add the folder that contains rclone.exe to your PATH environment variable. +* Take the `scripts/rclone/rclone.conf` file from this SILNLP repo and copy it to `~\AppData\Roaming\rclone` (creating folders if necessary) +* Add your credentials in the appropriate fields in `~\AppData\Roaming\rclone\rclone.conf` +* Take the `scripts/rclone/mount_to_s.bat` file from this SILNLP repo and copy it to the folder that contains the unzipped rclone. +* Double-click the bat file. A command window should open and remain open.
You should see something like: +``` +C:\Users\David\Software\rclone>call rclone mount --vfs-cache-mode full --use-server-modtime s3aqua:aqua-ml-data S: +The service rclone has been started. +``` + +**Linux** + +The following will mount /aqua-ml-data to an S folder in your home directory and allow you to explore, read and write. +* Download rclone from: https://rclone.org/install/ +* Take the `scripts/rclone/rclone.conf` file from this SILNLP repo and copy it to `~/.config/rclone/rclone.conf` (creating folders if necessary) +* Add your credentials in the appropriate fields in `~/.config/rclone/rclone.conf` +* Create a folder called "S" in your home directory +* Run the following command: + ``` + rclone mount --vfs-cache-mode full --use-server-modtime s3aqua:aqua-ml-data ~/S + ``` +### To start S: drive on start up + +**Windows** + +Put a shortcut to the mount_to_s.bat file in the Startup folder. +* In Windows Explorer, put `shell:startup` in the address bar or open `C:\Users\<username>\AppData\Roaming\Microsoft\Windows\Start Menu\Programs\Startup` +* Right-click to add a new shortcut. Choose `mount_to_s.bat` as the target; you can leave the name as the default. + +Now your AWS S3 bucket should be mounted as the S: drive when you start Windows. + +**Linux** +* Run `crontab -e` +* Paste `@reboot rclone mount --vfs-cache-mode full --use-server-modtime s3aqua:aqua-ml-data ~/S` into the file, save and exit +* Reboot Linux + +Now your AWS S3 bucket should be mounted as ~/S when you start Linux. \ No newline at end of file