From 2dd95681b277707630731d8e468d1e72235b3576 Mon Sep 17 00:00:00 2001
From: Rui Campos
Date: Wed, 3 Jan 2024 17:50:26 +0000
Subject: [PATCH] fixes

---
 .github/workflows/gpt_shakespear_experiment.yml |  8 +-------
 gpt_shakespear/train_worker.py                  |  3 ++-
 gpt_shakespear/user_data.sh                     | 16 ++++++----------
 mtn_shakespeare/model/self_attention.py         | 13 ++++++++++---
 4 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/gpt_shakespear_experiment.yml b/.github/workflows/gpt_shakespear_experiment.yml
index a0ae11ff..e8953cc1 100644
--- a/.github/workflows/gpt_shakespear_experiment.yml
+++ b/.github/workflows/gpt_shakespear_experiment.yml
@@ -24,16 +24,11 @@ on:
         required: true
         type: string
         default: '11'
-      tokens:
-        description: 'The number of tokens in the vocab.'
-        required: true
-        type: string
-        default: '3'
       bias:
         description: 'Whether to use bias in the linear layers.'
         required: true
         type: string
-        default: 'True'
+        default: 'False'
       batch_size:
         description: 'The batch size.'
         required: true
@@ -90,7 +85,6 @@ jobs:
 
           export INSTANCE_TYPE=${{ github.event.inputs.instance_type }}
           export COORDINATES=${{ github.event.inputs.coordinates }}
-          export TOKENS=${{ github.event.inputs.tokens }}
           export WORDS=${{ github.event.inputs.words }}
           export NUMBER_OF_BLOCKS=${{ github.event.inputs.number_of_blocks }}
           export NUMBER_OF_HEADS=${{ github.event.inputs.number_of_heads }}
diff --git a/gpt_shakespear/train_worker.py b/gpt_shakespear/train_worker.py
index 4a83bcb4..96ddf849 100644
--- a/gpt_shakespear/train_worker.py
+++ b/gpt_shakespear/train_worker.py
@@ -15,6 +15,7 @@
 from typing import Literal, Iterator
 import tiktoken
 import numpy as np
+from typing import Optional
 
 
 
@@ -164,7 +165,7 @@ def generate_epoch() -> Iterator[tuple[torch.Tensor, torch.Tensor]]:
         raise ValueError(f"Unknown loss function {train_params.loss_function}")
 
     # ----------------- LOAD MODEL ----------------- #
-    last_epoch: float | None = mlflow.get_run(run.info.run_id).data.metrics.get('epoch', None)
+    last_epoch: Optional[float] = mlflow.get_run(run.info.run_id).data.metrics.get('epoch', None)
     if last_epoch is not None:
         last_epoch = int(last_epoch)
     logger.debug("Last epoch is %s", last_epoch)
diff --git a/gpt_shakespear/user_data.sh b/gpt_shakespear/user_data.sh
index 949a5dde..5627bef6 100644
--- a/gpt_shakespear/user_data.sh
+++ b/gpt_shakespear/user_data.sh
@@ -3,8 +3,6 @@
 export AWS_ACCESS_KEY_ID={AWS_ACCESS_KEY_ID}
 export AWS_SECRET_ACCESS_KEY={AWS_SECRET_ACCESS_KEY}
 
-sudo yum install amazon-cloudwatch-agent -y
-
 
 
 cat << 'EOF' > send_logs_to_cloudwatch.sh
@@ -55,13 +53,13 @@
 export MLFLOW_TRACKING_PASSWORD={MLFLOW_TRACKING_PASSWORD}
 
 sudo mkdir /larger_tmp
 export TMPDIR=/larger_tmp
 
-sudo fallocate -l 30G /swapfile
-sudo chmod 600 /swapfile
-sudo mkswap /swapfile1
-sudo swapon /swapfile
+# sudo fallocate -l 30G /swapfile
+# sudo chmod 600 /swapfile
+# sudo mkswap /swapfile
+# sudo swapon /swapfile
 
 sudo yum update -y
-sudo yum install -y git 
+sudo yum install -y git
 sudo yum install -y python
 sudo yum install -y python3-pip
@@ -69,13 +67,11 @@
 git clone https://github.com/Digital-Defiance/llm-voice-chat.git
 cd llm-voice-chat
 git checkout {current_commit}
 
-
-
 python -m venv env
 source env/bin/activate
 pip install -r .devcontainer/requirements.txt
 cd gpt_shakespear
 python train_worker.py
 #wait two minutes before shutting down, so that the logs can be sent to cloudwatch
-shutdown -h +2
+shutdown -h +1
diff --git a/mtn_shakespeare/model/self_attention.py b/mtn_shakespeare/model/self_attention.py
index a66cd117..53aeaeff 100644
--- a/mtn_shakespeare/model/self_attention.py
+++ b/mtn_shakespeare/model/self_attention.py
@@ -13,6 +13,11 @@ class SelfAttentionParameters(Protocol):
     words: int
     number_of_heads: int
 
+
+
+
+
+
 class SelfAttention(nn.Module):
     attention_heads_dc: nn.Linear
     projection_cc: nn.Linear
@@ -28,9 +33,11 @@ def __init__(self, params: SelfAttentionParameters):
         self.COORDINATES = params.coordinates
         self.NUMBER_OF_HEADS = params.number_of_heads
 
-        # d = 3*coordinates
-        dimension = 3 * params.coordinates
-
+
+        dimension = 2 * params.coordinates
+
+
+
         self.attention_heads_dc = nn.Linear(
             params.coordinates,
             dimension,
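
Note on the train_worker.py hunk: the "float | None" union spelling (PEP 604) is only
valid on Python 3.10+ wherever the annotation is actually evaluated, while
typing.Optional works back to Python 3.7, which is presumably why the patch swaps it
in. A minimal standalone sketch of the difference (illustrative only, not the worker
code itself):

    from typing import Optional

    # Works on Python 3.7+: Optional[float] means Union[float, None].
    def get_last_epoch() -> Optional[float]:
        return None

    # Equivalent, but Python 3.10+ only, because the signature annotation is
    # evaluated when the function is defined:
    # def get_last_epoch() -> float | None:
    #     return None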
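
Note on the self_attention.py hunk: the removed 3 * params.coordinates is the width of
a conventional fused query/key/value projection, while the new 2 * params.coordinates
can only be carved into two tensors, so the module's forward pass (not shown in this
patch) must consume the projection differently. For reference, a sketch of the standard
fused-QKV layout that the 3x factor corresponds to; the class and variable names below
are illustrative assumptions, not this repository's code:

    import torch
    from torch import nn

    class FusedQKVSelfAttention(nn.Module):
        # One linear layer maps coordinates -> 3 * coordinates, and its output
        # is split into equal query, key and value tensors.
        def __init__(self, coordinates: int, number_of_heads: int, bias: bool = False):
            super().__init__()
            assert coordinates % number_of_heads == 0
            self.number_of_heads = number_of_heads
            self.qkv = nn.Linear(coordinates, 3 * coordinates, bias=bias)
            self.projection = nn.Linear(coordinates, coordinates, bias=bias)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            b, t, c = x.shape
            q, k, v = self.qkv(x).split(c, dim=-1)
            # (batch, tokens, coordinates) -> (batch, heads, tokens, head_dim)
            q, k, v = (
                z.view(b, t, self.number_of_heads, c // self.number_of_heads).transpose(1, 2)
                for z in (q, k, v)
            )
            y = nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True)
            y = y.transpose(1, 2).contiguous().view(b, t, c)
            return self.projection(y)

Under the patched 2x width, the analogous call would split attention_heads_dc's output
into two halves; what those halves represent is defined by the rest of the module.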