diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..44b34da --- /dev/null +++ b/.gitignore @@ -0,0 +1,166 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Trunk linter +.trunk/out +.trunk/external +.trunk/logs +.trunk/actions +.trunk/notifications + +# do not include coverage file +pytest-coverage.txt + +#vs-code config +.vscode + +# Mac os index files +.DS_Store +notebooks/.vector_cache/wiki.en.vec +notebooks/.vector_cache/wiki.en.vec.pt +notebooks/~/ +~/ +__MACOSX/ +.idea/ + +# Compressed files +*.mat +*.zip +*.rar + +# Datasets +torch_datasets/ +data/ +*.csv +*.xls + +# Experimental files +notebooks/.vector_cache/* +notebooks/my_awesome_qa_model/runs* \ No newline at end of file diff --git a/flexnlp/__init__.py b/flexnlp/__init__.py new file mode 100644 index 0000000..39bcfa1 --- /dev/null +++ b/flexnlp/__init__.py @@ -0,0 +1,7 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from flexnlp import utils + +__version__ = "0.0.1" diff --git a/flexnlp/notebooks/Centralized_QA.ipynb b/flexnlp/notebooks/Centralized_QA.ipynb new file mode 100644 index 0000000..e74aa54 --- /dev/null +++ b/flexnlp/notebooks/Centralized_QA.ipynb @@ -0,0 +1,479 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "from transformers import AutoTokenizer\n", + "from transformers import DefaultDataCollator\n", + "from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer\n", + "import collections\n", + "import numpy as np\n", + "import evaluate\n" + ] + }, + 
{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load SQuAD dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset squad (C:/Users/Cris/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DatasetDict({\n", + " train: Dataset({\n", + " features: ['id', 'title', 'context', 'question', 'answers'],\n", + " num_rows: 700\n", + " })\n", + " test: Dataset({\n", + " features: ['id', 'title', 'context', 'question', 'answers'],\n", + " num_rows: 176\n", + " })\n", + "})\n" + ] + } + ], + "source": [ + "# Load a percentage of squal\n", + "squad = load_dataset(\"squad\", split=\"train[:1%]\")\n", + "# Split 80% train, 20% test\n", + "squad = squad.train_test_split(test_size=0.2)\n", + "print(squad )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Preprocess" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "model_checkpoint = \"distilbert-base-uncased\"\n", + "#model_checkpoint = \"bert-base-cased\"\n", + "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "text/plain": [ + "(700, 728)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "max_length = 384\n", + "stride = 128\n", + "\n", + "\n", + "def preprocess_training_examples(examples):\n", + " questions = [q.strip() for q in examples[\"question\"]]\n", + " inputs = tokenizer(\n", + " questions,\n", + " examples[\"context\"],\n", + " max_length=max_length,\n", + " truncation=\"only_second\",\n", + " stride=stride,\n", + " return_overflowing_tokens=True,\n", + " return_offsets_mapping=True,\n", + " padding=\"max_length\",\n", + " )\n", + "\n", + " offset_mapping = inputs.pop(\"offset_mapping\")\n", + " sample_map = inputs.pop(\"overflow_to_sample_mapping\")\n", + " answers = examples[\"answers\"]\n", + " start_positions = []\n", + " end_positions = []\n", + "\n", + " for i, offset in enumerate(offset_mapping):\n", + " sample_idx = sample_map[i]\n", + " answer = answers[sample_idx]\n", + " start_char = answer[\"answer_start\"][0]\n", + " end_char = answer[\"answer_start\"][0] + len(answer[\"text\"][0])\n", + " sequence_ids = inputs.sequence_ids(i)\n", + "\n", + " # Find the start and end of the context\n", + " idx = 0\n", + " while sequence_ids[idx] != 1:\n", + " idx += 1\n", + " context_start = idx\n", + " while sequence_ids[idx] == 1:\n", + " idx += 1\n", + " context_end = idx - 1\n", + "\n", + " # If the answer is not fully inside the context, label is (0, 0)\n", + " if offset[context_start][0] > start_char or offset[context_end][1] < end_char:\n", + " start_positions.append(0)\n", + " end_positions.append(0)\n", + " else:\n", + " # Otherwise it's the start and end token positions\n", + " idx = context_start\n", + " while idx <= context_end and offset[idx][0] <= start_char:\n", + " idx += 1\n", + " start_positions.append(idx - 1)\n", + "\n", + " idx = context_end\n", + " while idx >= context_start and offset[idx][1] >= end_char:\n", + " idx -= 1\n", + " end_positions.append(idx + 1)\n", + "\n", + " 
inputs[\"start_positions\"] = start_positions\n", + " inputs[\"end_positions\"] = end_positions\n", + " return inputs\n", + "\n", + "train_dataset = squad[\"train\"].map(\n", + " preprocess_training_examples,\n", + " batched=True,\n", + " remove_columns=squad[\"train\"].column_names,\n", + ")\n", + "len(squad[\"train\"]), len(train_dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "text/plain": [ + "(176, 180)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def preprocess_validation_examples(examples):\n", + " questions = [q.strip() for q in examples[\"question\"]]\n", + " inputs = tokenizer(\n", + " questions,\n", + " examples[\"context\"],\n", + " max_length=max_length,\n", + " truncation=\"only_second\",\n", + " stride=stride,\n", + " return_overflowing_tokens=True,\n", + " return_offsets_mapping=True,\n", + " padding=\"max_length\",\n", + " )\n", + "\n", + " sample_map = inputs.pop(\"overflow_to_sample_mapping\")\n", + " example_ids = []\n", + "\n", + " for i in range(len(inputs[\"input_ids\"])):\n", + " sample_idx = sample_map[i]\n", + " example_ids.append(examples[\"id\"][sample_idx])\n", + "\n", + " sequence_ids = inputs.sequence_ids(i)\n", + " offset = inputs[\"offset_mapping\"][i]\n", + " inputs[\"offset_mapping\"][i] = [\n", + " o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)\n", + " ]\n", + "\n", + " inputs[\"example_id\"] = example_ids\n", + " return inputs\n", + "\n", + "validation_dataset = squad[\"test\"].map(\n", + " preprocess_validation_examples,\n", + " batched=True,\n", + " remove_columns=squad[\"test\"].column_names,\n", + ")\n", + "len(squad[\"test\"]), len(validation_dataset)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Training" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + } + ], + "source": [ + "model = AutoModelForQuestionAnswering.from_pretrained(\"distilbert-base-uncased\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/138 [00:00 max_answer_length\n", + " if (\n", + " end_index < start_index\n", + " or end_index - start_index + 1 > max_answer_length\n", + " ):\n", + " continue\n", + "\n", + " answer = {\n", + " \"text\": context[offsets[start_index][0] : offsets[end_index][1]],\n", + " \"logit_score\": start_logit[start_index] + end_logit[end_index],\n", + " }\n", + " answers.append(answer)\n", + "\n", + " # Select the answer with the best score\n", + " if len(answers) > 0:\n", + " best_answer = max(answers, key=lambda x: x[\"logit_score\"])\n", + " predicted_answers.append(\n", + " {\"id\": example_id, \"prediction_text\": best_answer[\"text\"]}\n", + " )\n", + " else:\n", + " predicted_answers.append({\"id\": example_id, \"prediction_text\": \"\"})\n", + "\n", + " theoretical_answers = [{\"id\": ex[\"id\"], \"answers\": ex[\"answers\"]} for ex in 
examples]\n", + " return metric.compute(predictions=predicted_answers, references=theoretical_answers)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 12/12 [00:58<00:00, 4.88s/it]\n" + ] + } + ], + "source": [ + "predictions, _, _ = trainer.predict(validation_dataset)\n", + "start_logits, end_logits = predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "180\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 176/176 [00:00<00:00, 573.78it/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "{'exact_match': 10.227272727272727, 'f1': 13.03407354677408}" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(len(predictions[0]))\n", + "compute_metrics(start_logits, end_logits, validation_dataset, squad[\"test\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "flexible", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/flexnlp/notebooks/Centralized_QA.py b/flexnlp/notebooks/Centralized_QA.py new file mode 100644 index 0000000..14c82fa --- /dev/null +++ b/flexnlp/notebooks/Centralized_QA.py @@ -0,0 +1,216 @@ +import torch +from datasets import load_dataset +from transformers import AutoTokenizer +from transformers import DefaultDataCollator +from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer +import collections +import numpy as np +import evaluate + + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# Load a percentage of squal +squad = load_dataset("squad", split="train") +# Split 80% train, 20% test +squad = squad.train_test_split(test_size=0.2) +print(squad ) + +model_checkpoint = "distilbert-base-uncased" +#model_checkpoint = "bert-base-cased" +tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) + +max_length = 384 +stride = 128 + + +def preprocess_training_examples(examples): + questions = [q.strip() for q in examples["question"]] + inputs = tokenizer( + questions, + examples["context"], + max_length=max_length, + truncation="only_second", + stride=stride, + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding="max_length", + ) + + offset_mapping = inputs.pop("offset_mapping") + sample_map = inputs.pop("overflow_to_sample_mapping") + answers = examples["answers"] + start_positions = [] + end_positions = [] + + for i, offset in enumerate(offset_mapping): + sample_idx = sample_map[i] + answer = answers[sample_idx] + start_char = answer["answer_start"][0] + end_char = answer["answer_start"][0] + len(answer["text"][0]) + sequence_ids = inputs.sequence_ids(i) + + # Find the start and end of the context + idx = 0 + while sequence_ids[idx] != 1: + idx += 1 + context_start = idx + while sequence_ids[idx] == 1: + idx += 1 + context_end = idx - 1 + + # If the answer is not fully inside the context, label is (0, 0) + if offset[context_start][0] > start_char or offset[context_end][1] < end_char: + 
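+            # Worked example with hypothetical offsets: for an answer with
+            # start_char=17 and end_char=24 and context token offsets
+            # [..., (17, 21), (22, 24), ...], the else branch below labels the
+            # positions of the tokens covering (17, 21) and (22, 24) as start/end.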
start_positions.append(0) + end_positions.append(0) + else: + # Otherwise it's the start and end token positions + idx = context_start + while idx <= context_end and offset[idx][0] <= start_char: + idx += 1 + start_positions.append(idx - 1) + + idx = context_end + while idx >= context_start and offset[idx][1] >= end_char: + idx -= 1 + end_positions.append(idx + 1) + + inputs["start_positions"] = start_positions + inputs["end_positions"] = end_positions + return inputs + +train_dataset = squad["train"].map( + preprocess_training_examples, + batched=True, + remove_columns=squad["train"].column_names, +) + +print("Len train y test") +print(len(squad["train"]), len(train_dataset)) + +def preprocess_validation_examples(examples): + questions = [q.strip() for q in examples["question"]] + inputs = tokenizer( + questions, + examples["context"], + max_length=max_length, + truncation="only_second", + stride=stride, + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding="max_length", + ) + + sample_map = inputs.pop("overflow_to_sample_mapping") + example_ids = [] + + for i in range(len(inputs["input_ids"])): + sample_idx = sample_map[i] + example_ids.append(examples["id"][sample_idx]) + + sequence_ids = inputs.sequence_ids(i) + offset = inputs["offset_mapping"][i] + inputs["offset_mapping"][i] = [ + o if sequence_ids[k] == 1 else None for k, o in enumerate(offset) + ] + + inputs["example_id"] = example_ids + return inputs + +validation_dataset = squad["test"].map( + preprocess_validation_examples, + batched=True, + remove_columns=squad["test"].column_names, +) + +print("Len val y test") +len(squad["test"]), len(validation_dataset) + +# Training +model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased") + +training_args = TrainingArguments( + output_dir="my_awesome_qa_model", + evaluation_strategy="epoch", + learning_rate=2e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + num_train_epochs=3, + weight_decay=0.01, + use_cpu=False, + torch_compile=True, +) + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=validation_dataset, + tokenizer=tokenizer, + # data_collator=data_collator, +) + +output = trainer.train() + +from tqdm.auto import tqdm + +n_best = 20 +max_answer_length = 30 +predicted_answers = [] +metric = evaluate.load("squad") + +def compute_metrics(start_logits, end_logits, features, examples): + example_to_features = collections.defaultdict(list) + for idx, feature in enumerate(features): + example_to_features[feature["example_id"]].append(idx) + + predicted_answers = [] + for example in tqdm(examples): + example_id = example["id"] + context = example["context"] + answers = [] + + # Loop through all features associated with that example + for feature_index in example_to_features[example_id]: + start_logit = start_logits[feature_index] + end_logit = end_logits[feature_index] + offsets = features[feature_index]["offset_mapping"] + + start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist() + end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist() + for start_index in start_indexes: + for end_index in end_indexes: + # Skip answers that are not fully in the context + if offsets[start_index] is None or offsets[end_index] is None: + continue + # Skip answers with a length that is either < 0 or > max_answer_length + if ( + end_index < start_index + or end_index - start_index + 1 > max_answer_length + ): + continue + + answer = { + "text": 
context[offsets[start_index][0] : offsets[end_index][1]], + "logit_score": start_logit[start_index] + end_logit[end_index], + } + answers.append(answer) + + # Select the answer with the best score + if len(answers) > 0: + best_answer = max(answers, key=lambda x: x["logit_score"]) + predicted_answers.append( + {"id": example_id, "prediction_text": best_answer["text"]} + ) + else: + predicted_answers.append({"id": example_id, "prediction_text": ""}) + + theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples] + return metric.compute(predictions=predicted_answers, references=theoretical_answers) + +predictions, _, _ = trainer.predict(validation_dataset) +start_logits, end_logits = predictions + +print(len(predictions[0])) +compute_metrics(start_logits, end_logits, validation_dataset, squad["test"]) + diff --git a/flexnlp/notebooks/Federated IMDb PT using FLExible with a GRU.ipynb b/flexnlp/notebooks/Federated IMDb PT using FLExible with a GRU.ipynb new file mode 100644 index 0000000..119aaa3 --- /dev/null +++ b/flexnlp/notebooks/Federated IMDb PT using FLExible with a GRU.ipynb @@ -0,0 +1,691 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from copy import deepcopy\n", + "import numpy as np\n", + "\n", + "from datasets.load import load_dataset\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "from torch.utils.data import DataLoader\n", + "from torch.utils.data import Dataset as TorchDataset\n", + "import torchtext\n", + "\n", + "from flexnlp.utils.collators import ClassificationCollator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "device = (\n", + " \"cuda\"\n", + " if torch.cuda.is_available()\n", + " else \"mps\"\n", + " if torch.backends.mps.is_available()\n", + " else \"cpu\"\n", + ")\n", + "\n", + "print(device)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# imdb_dataset = load_dataset('imdb', split=['train', 'test']) # Get the dataset from huggingface library\n", + "train_dataset, test_dataset = torchtext.datasets.AG_NEWS() # Get the dataset from torchtext library\n", + "unique_classes = set([label for (label, text) in train_dataset])\n", + "num_classes = len(unique_classes)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Preparativos como los embeddings, el vocabulario, etc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torchtext.data.utils import get_tokenizer\n", + "from torchtext.vocab import GloVe, FastText, vocab" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embeddings_dim = 50 # Dimension of the embeddings\n", + "glove = GloVe(name='6B', dim=embeddings_dim) # Load GloVe embeddings with 100 dimensions.\n", + "# fasttext = FastText(language='en') # To use FastText instead of GloVe\n", + "vocabulary = vocab(glove.stoi)\n", + "# vocabulary_fasttext = vocab(fasttext.stoi) # To use FastText instead of GloVe\n", + "vocab_size = len(vocabulary) # Get the vocabulary size\n", + "print(f\"Total vocabulary size: {vocab_size}\")\n", + "print(f\"Shape of embeddings: {glove.vectors.shape}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "example = \"This is an example sentence to test the tokenizer.\"\n", + "tokenizer = get_tokenizer(\"basic_english\")\n", + "spacy_tokenizer = get_tokenizer(\"spacy\", language=\"en_core_web_sm\")\n", + "example_tokens = tokenizer(example)\n", + "example_tokens_spacy = spacy_tokenizer(example)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vocabulary.get_itos()[:10] # Get the first 10 words of the vocabulary" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Padding token idx, pad: {vocabulary.get_itos()[0]}\") # Get the index of the word '' for padding\n", + "print(f\"Padding token idx, pad: {vocabulary.get_itos()[0:10]}\") # Get the index of the word '' for padding" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pad_token = \"\"\n", + "pad_index = 0\n", + "vocabulary.insert_token(pad_token, pad_index)\n", + "vocabulary.set_default_index(pad_index)\n", + "# glove.vectors = torch.cat(1, (torch.zeros(1, embeddings_dim), glove.vectors))\n", + "pretrained_embeddings = glove.vectors\n", + "print(f\"Len pretrained embeddings: {len(pretrained_embeddings)}\")\n", + "pretrained_embeddings = torch.cat((torch.zeros(1,pretrained_embeddings.shape[1]),pretrained_embeddings))\n", + "print(f\"Len pretrained embeddings: {len(pretrained_embeddings)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Padding token idx, pad: {vocabulary.get_itos()[0:10]}\") # Get the index of the word '' for padding" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can use the basic english tokenizer from PyTorch, or the SpaCy tokenizer if we have spacy downloaded. Here we probe both tokenizer with the same example sentence." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Basic English Tokenizer: {example_tokens}\")\n", + "print(f\"Spacy Tokenizer: {example_tokens_spacy}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Client's will probably want to delete the stopwords, optional, as the embeddings may have vectors for most of the stopwords. Here we show multiple options show the user must decide what he prefers to use. In this notebook we're going to use the first case, as it will have most information. In other case, we would use the last one, so at least we keep the most information we can. \n", + "\n", + "Later we will have to tokenize the clients data, and then we will add the padding to the sequences, and will convert the token to the index of the embedding matrix (ids)." 
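+    "\n",
+    "For illustration, a minimal sketch of that pipeline (a hypothetical example reusing the `spacy_tokenizer`, `vocabulary` and `pad_index` defined above):\n",
+    "\n",
+    "```python\n",
+    "sentences = [\"a first short example\", \"a slightly longer second example sentence\"]\n",
+    "ids = [torch.tensor([vocabulary[t.lower()] for t in spacy_tokenizer(s) if t.lower() in vocabulary]) for s in sentences]\n",
+    "batch = torch.nn.utils.rnn.pad_sequence(ids, padding_value=pad_index, batch_first=True)\n",
+    "print(batch.shape)  # (2, length of the longest sequence)\n",
+    "```"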
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Remove stopwords\n", + "from nltk.corpus import stopwords\n", + "stop_words = set(stopwords.words('english'))\n", + "\n", + "print(f\"Example tokens tokenized: {[word.lower() for word in example_tokens_spacy]}\")\n", + "\n", + "print(f\"Example tokens without stopwords: {[word.lower() for word in example_tokens_spacy if word not in stop_words]}\")\n", + "\n", + "print(f\"Example tokens without stopwords and word in vocabulary: {[word.lower() for word in example_tokens_spacy if word not in stop_words and word.lower() in vocabulary]}\")\n", + "\n", + "print(f\"Example tokens without quitting stopwords and word in vocabulary: {[word.lower() for word in example_tokens_spacy if word.lower() in vocabulary]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# From centralized data to federated data\n", + "\n", + "First we're going to federate the dataset using the FedDataDristibution class, that has functions to load multiple datasets from deep learning libraries such as PyTorch or TensorFlow. In this notebook we are using PyTorch, so we need to use the functions from the PyTorch ecosystem, and for the text datasets, we need to use the function `from_config_with_torchtext_dataset`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from flex.data import FedDatasetConfig, FedDataDistribution\n", + "\n", + "config = FedDatasetConfig(seed=0)\n", + "config.n_clients = 2\n", + "config.replacement = False # ensure that clients do not share any data\n", + "config.client_names = ['client1', 'client2'] # Optional\n", + "flex_dataset = FedDataDistribution.from_config_with_torchtext_dataset(data=train_dataset, config=config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We may also want to use the FLEXible dataset for the test data, so we just use da function `from_torchtext_dataset` in the Dataset class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from flex.data import Dataset\n", + "\n", + "test_dataset = Dataset.from_torchtext_dataset(test_dataset)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2) Federate a model with FLEXible.\n", + "\n", + "Once we've federated the dataset, it's time to create the FlexPool. The FlexPool class is the one that simulates the real-time scenario for federated learning, so it is in charge of the communications across actors. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from flex.model import FlexModel\n", + "from flex.pool import FlexPool\n", + "\n", + "from flex.pool.decorators import init_server_model\n", + "from flex.pool.decorators import deploy_server_model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook we are going to simulate a client-server architecture, which we can easily build using the FlexPool class, using the function `client_server_architecture`. This function needs a FlexDataset, which we already have prepared, and a function to initialize the server model, which we have to create.\n", + "\n", + "The model we are going to use is a simple LSTM, which will have the embeddings, the LSTM, a Linear layer and the output layer." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "class GruNet(nn.Module):\n", + " def __init__(self, embeddings, hidden_size, num_classes):\n", + " super().__init__()\n", + " # Initialize the Embedding Layer with the GloVe embeddings.\n", + " self.emb = nn.Embedding.from_pretrained(embeddings,\n", + " freeze=True,\n", + " padding_idx=0\n", + " )\n", + " # Take the embeddings size from the embeddings vector.\n", + " self.embedding_size = embeddings.shape[1]\n", + " #Create the GRU layer with just one layer.\n", + " self.gru = nn.GRU(self.embedding_size,\n", + " hidden_size,\n", + " batch_first=True,\n", + " num_layers=1\n", + " )\n", + " # Create the prediction layer.\n", + " self.fc = nn.Linear(hidden_size, num_classes)\n", + "\n", + " def forward(self, x):\n", + " # x.shape = [batch_size, len]\n", + " x = self.emb(x)\n", + " # x.shape = [batch_size, len, emb_dim]\n", + " _, x = self.gru(x)\n", + " # x.shape = [1, batch_size, hid_dim]\n", + " x = self.fc(x)\n", + " return x\n", + "\n", + "\n", + "@init_server_model\n", + "def build_server_model():\n", + " server_flex_model = FlexModel()\n", + "\n", + " server_flex_model['model'] = GruNet(embeddings=pretrained_embeddings, hidden_size=128,\n", + " num_classes=num_classes)\n", + " # Required to store this for later stages of the FL training process\n", + " server_flex_model[\"criterion\"] = torch.nn.CrossEntropyLoss()\n", + " server_flex_model[\"optimizer_func\"] = torch.optim.SGD\n", + " server_flex_model[\"optimizer_kwargs\"] = {}\n", + "\n", + " return server_flex_model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we've defined the function to initialize the server model, we can create the FlexPool using the function `client_server_architecture`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "flex_pool = FlexPool.client_server_pool(fed_dataset=flex_dataset, init_func=build_server_model)\n", + "\n", + "clients = flex_pool.clients\n", + "servers = flex_pool.servers\n", + "aggregators = flex_pool.aggregators\n", + "\n", + "print(f\"Number of nodes in the pool {len(flex_pool)}: {len(servers)} server plus {len(clients)} clients. The server is also an aggregator\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can use the decorator `deploy_server_model` to create a custom function that deploys our server model, or we can use the primitive `deploy_server_model_pt` to deploy the server model to the clients." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from flex.pool import deploy_server_model, deploy_server_model_pt\n", + "\n", + "@deploy_server_model\n", + "def copy_server_model_to_clients(server_flex_model: FlexModel):\n", + " return deepcopy(server_flex_model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "servers.map(copy_server_model_to_clients, clients) # Using the function created with the decorator\n", + "# servers.map(deploy_server_model_pt, clients) # Using the primitive function" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As text needs to be preprocessed and batched on the clients, we can do it on the train function." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "import random\n", + "\n", + "from tqdm import tqdm\n", + "\n", + "from torch.nn.utils.rnn import pad_sequence\n", + "\n", + "BATCH_SIZE = 256\n", + "NUM_EPOCHS = 10\n", + "\n", + "def clean_str(string):\n", + " \"\"\"\n", + " Tokenization/string cleaning.\n", + " Original from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py\n", + " \"\"\"\n", + " string = re.sub(r\"[^A-Za-z0-9(),!?\\'\\`]\", \" \", string)\n", + " string = re.sub(r\"\\'s\", \" \\'s\", string)\n", + " string = re.sub(r\"\\'ve\", \" \\'ve\", string)\n", + " string = re.sub(r\"n\\'t\", \" n\\'t\", string)\n", + " string = re.sub(r\"\\'re\", \" \\'re\", string)\n", + " string = re.sub(r\"\\'d\", \" \\'d\", string)\n", + " string = re.sub(r\"\\'ll\", \" \\'ll\", string)\n", + " string = re.sub(r\",\", \" , \", string)\n", + " string = re.sub(r\"!\", \" ! \", string)\n", + " string = re.sub(r\"\\(\", \" \\( \", string)\n", + " string = re.sub(r\"\\)\", \" \\) \", string)\n", + " string = re.sub(r\"\\?\", \" \\? \", string)\n", + " string = re.sub(r\"\\s{2,}\", \" \", string)\n", + "\n", + " return string.strip().lower()\n", + "\n", + "def collate_batch(batch):\n", + " def preprocess_text(text):\n", + " text_transform = lambda x: [vocabulary[\"\"]]+[vocabulary[token] for token in spacy_tokenizer(x)]+[vocabulary[\"\"]]\n", + " return list(text_transform(clean_str(text)))\n", + " label_list, text_list = [], []\n", + " for (_text, _label) in batch:\n", + " label_transform = lambda x: int(x) - 1\n", + " label_list.append(label_transform(_label))\n", + " processed_text = torch.tensor(preprocess_text(_text))\n", + " text_list.append(processed_text)\n", + " label_list = torch.tensor(label_list, dtype=torch.int64)\n", + " return pad_sequence(text_list, padding_value=pad_index, batch_first=True), label_list\n", + "\n", + "def batch_sampler_v2(batch_size, indices):\n", + " random.shuffle(indices)\n", + " pooled_indices = []\n", + " # create pool of indices with similar lengths \n", + " for i in range(0, len(indices), batch_size * 100):\n", + " pooled_indices.extend(sorted(indices[i:i + batch_size * 100], key=lambda x: x[1]))\n", + "\n", + " pooled_indices = [x[0] for x in pooled_indices]\n", + "\n", + " # yield indices for current batch\n", + " for i in range(0, len(pooled_indices), batch_size):\n", + " yield pooled_indices[i:i + batch_size]\n", + "\n", + "def train(client_flex_model: FlexModel, client_data: Dataset):\n", + " X_data, y_data = client_data.to_list()\n", + " if 'train_indices' not in client_flex_model:\n", + " train_indices = [(i, len(tokenizer(s[0]))) for i, s in enumerate(X_data)]\n", + " client_flex_model['train_indices'] = train_indices\n", + " else:\n", + " train_indices = client_flex_model['train_indices']\n", + " # batch_size=BATCH_SIZE, shuffle=True, # No es necesario usarlo porque usamos el batch_sampler\n", + " client_dataloader = DataLoader(client_data, collate_fn=collate_batch, batch_size=BATCH_SIZE,\n", + " shuffle=True)\n", + " #  batch_sampler=batch_sampler_v2(BATCH_SIZE, train_indices))\n", + " model = client_flex_model[\"model\"]\n", + " # lr = 0.001\n", + " optimizer = client_flex_model['optimizer_func'](model.parameters(), lr=0.1, **client_flex_model[\"optimizer_kwargs\"])\n", + " model = model.train()\n", + " model = model.to(device)\n", + " criterion = client_flex_model[\"criterion\"]\n", + " # Al usar batch_sampler, hay que recargar el DataLoader en cada 
epoch.\n", + " for _ in tqdm(range(NUM_EPOCHS)):\n", + " # client_dataloader = DataLoader(client_data, collate_fn=collate_batch,\n", + " # batch_sampler=batch_sampler_v2(BATCH_SIZE, train_indices))\n", + " losses = []\n", + " total_acc, total_count = 0, 0\n", + " for texts, labels in client_dataloader:\n", + " optimizer.zero_grad()\n", + " texts, labels = texts.to(device), labels.to(device)\n", + " predicted_labels = model(texts).squeeze(dim=0)\n", + " # pred = pred.squeeze(dim=0)\n", + " loss = criterion(predicted_labels, labels)\n", + " if predicted_labels.isnan().any():\n", + " print(f\"Text in batch: {texts}\")\n", + " print(f\"Predicted labels in batch: {predicted_labels}\")\n", + " print(f\"Labels in batch: {labels}\")\n", + " print(f\"Loss in batch: {loss}\")\n", + " loss.backward()\n", + " torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)\n", + " optimizer.step()\n", + " losses.append(loss.item())\n", + " total_acc += (predicted_labels.argmax(1) == labels).sum().item()\n", + " total_count += labels.shape[0]\n", + " total_loss = sum(losses)/len(losses)\n", + " total_acc = total_acc/total_count\n", + " print(f\"Accuracy after epoch: {total_acc}\\t|\\tLoss after epoch: {total_loss}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "clients.map(train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After training the model, we have to aggregate the weights from the clients model in order to update the global model. To to so, we are going to use the primitive `collect_clients_weights_pt`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from flex.pool import collect_clients_weights_pt\n", + "\n", + "aggregators.map(collect_clients_weights_pt, clients)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once the weights are aggregated, we aggregate them. In this notebook we use the FedAvg method that is already implemented in FLEXible." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from flex.pool import fed_avg\n", + "\n", + "aggregators.map(fed_avg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The function `set_aggregated_weights_pt` sed the aggregated weights to the server model to update it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from flex.pool import set_aggregated_weights_pt\n", + "\n", + "aggregators.map(set_aggregated_weights_pt, servers)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now it's turn to evaluate the global model. To do so, we have to create a function using the decoratod `evaluate_server_model`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from flex.pool import evaluate_server_model\n", + "\n", + "@evaluate_server_model\n", + "def evaluate_global_model(server_flex_model: FlexModel, test_data=None):\n", + " model = server_flex_model[\"model\"]\n", + " model.eval()\n", + " test_loss = 0\n", + " test_acc = 0\n", + " total_count = 0\n", + " model = model.to(device)\n", + " criterion=server_flex_model['criterion']\n", + " # get test data as a torchvision object\n", + " test_dataloader = DataLoader(test_dataset, batch_size=256, shuffle=True, pin_memory=False, collate_fn=collate_batch)\n", + " X_data, _ = test_dataset.to_list()\n", + " test_indices = [(i, len(tokenizer(s[0]))) for i, s in enumerate(X_data)]\n", + " test_dataloader = DataLoader(test_dataset, collate_fn=collate_batch,\n", + " batch_sampler=batch_sampler_v2(BATCH_SIZE, test_indices))\n", + " losses = []\n", + " with torch.no_grad():\n", + " for data, target in test_dataloader:\n", + " total_count += target.size(0)\n", + " data, target = data.to(device), target.to(device)\n", + " output = model(data).squeeze(dim=0)\n", + " loss = criterion(output, target)\n", + " losses.append(loss.item())\n", + " test_acc += (output.argmax(1) == target).sum().item()\n", + " total_count += target.shape[0]\n", + " # print(f\"Prediciones: {pred.squeeze(dim=1)}\")\n", + " # print(f\"Output: {output.data.max(1, keepdim=True)}\")\n", + " # print(f\"Target: {target}\")\n", + " # print(pred.eq(target.data.view_as(pred)).long().cpu().sum().item())\n", + " # test_acc += pred.eq(target.data.view_as(pred)).long().cpu().sum().item()\n", + " # print(f\"Test accuracy: {test_acc}\")\n", + "\n", + " test_loss = sum(losses) / len(losses)\n", + " test_acc /= total_count\n", + " print(f\"test loss: {test_loss}\")\n", + " print(f\"test acc: {test_acc}\")\n", + " return test_loss, test_acc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metrics = servers.map(evaluate_global_model, test_data=test_dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metrics[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run the federated learning experiment for a few rounds\n", + "\n", + "Now, we can summarize the steps provided above and run the federated experiment for multiple rounds:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def train_n_rounds(n_rounds, clients_per_round=2): \n", + " pool = FlexPool.client_server_pool(fed_dataset=flex_dataset, init_func=build_server_model)\n", + " for i in range(n_rounds):\n", + " print(f\"\\nRunning round: {i+1} of {n_rounds}\")\n", + " selected_clients_pool = pool.clients.select(clients_per_round)\n", + " selected_clients = selected_clients_pool.clients\n", + " print(f\"Selected clients for this round: {len(selected_clients)}\")\n", + " # Deploy the server model to the selected clients\n", + " pool.servers.map(deploy_server_model_pt, selected_clients)\n", + " # Each selected client trains her model\n", + " selected_clients.map(train)\n", + " # The aggregador collects weights from the selected clients and aggregates them\n", + " pool.aggregators.map(collect_clients_weights_pt, selected_clients)\n", + " pool.aggregators.map(fed_avg)\n", + " # The aggregator send its aggregated weights to the server\n", + " 
pool.aggregators.map(set_aggregated_weights_pt, pool.servers)\n",
+    "        metrics = pool.servers.map(evaluate_global_model, test_data=test_dataset)\n",
+    "        loss, acc = metrics[0]\n",
+    "        print(f\"Server: Test acc: {acc:.4f}, test loss: {loss:.4f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# train_n_rounds(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "flexible",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.3"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/flexnlp/pool/aggregators.py b/flexnlp/pool/aggregators.py
new file mode 100644
index 0000000..e69de29
diff --git a/flexnlp/utils/__init__.py b/flexnlp/utils/__init__.py
new file mode 100644
index 0000000..768a6b3
--- /dev/null
+++ b/flexnlp/utils/__init__.py
@@ -0,0 +1,8 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# from flexnlp.utils.collators import default_data_collator_classification
+# from flexnlp.utils.collators import classification_sampler
+# from flexnlp.utils.collators import ClassificationCollator
+from flexnlp.utils import collators
diff --git a/flexnlp/utils/collators/__init__.py b/flexnlp/utils/collators/__init__.py
new file mode 100644
index 0000000..4262749
--- /dev/null
+++ b/flexnlp/utils/collators/__init__.py
@@ -0,0 +1,7 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from flexnlp.utils.collators.collate_functions import default_data_collator_classification
+from flexnlp.utils.collators.collate_functions import basic_collate_pad_sequence_classification
+from flexnlp.utils.collators.classification_sampler import ClassificationCollator
\ No newline at end of file
diff --git a/flexnlp/utils/collators/classification_sampler.py b/flexnlp/utils/collators/classification_sampler.py
new file mode 100644
index 0000000..6695219
--- /dev/null
+++ b/flexnlp/utils/collators/classification_sampler.py
@@ -0,0 +1,25 @@
+import random
+from torch.utils.data.sampler import Sampler
+
+class ClassificationCollator(Sampler):
+    """Class that acts as a sampler for classification problems.
+    It indexes the dataset by text length and yields batches of
+    instances with similar lengths, which reduces padding and
+    improves the performance of the trained model.
+    """
+    def __init__(self, data_source, tokenizer, text_idx, batch_size) -> None:
+        self._ind_n_len = [(i, len(tokenizer(s[text_idx]))) for i, s in enumerate(data_source)]
+        self._batch_size = batch_size
+
+    def __iter__(self):
+        random.shuffle(self._ind_n_len)
+        pooled_indices = []
+        # create pool of indices with similar lengths
+        for i in range(0, len(self._ind_n_len), self._batch_size * 100):
+            pooled_indices.extend(sorted(self._ind_n_len[i:i + self._batch_size * 100], key=lambda x: x[1]))
+
+        pooled_indices = [x[0] for x in pooled_indices]
+
+        # yield indices for current batch
+        for i in range(0, len(pooled_indices), self._batch_size):
+            yield pooled_indices[i:i + self._batch_size]
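+
+# Usage sketch (illustrative only; ``train_data``, ``tokenizer`` and ``my_collate_fn``
+# are placeholders): since this class is a torch ``Sampler`` that yields lists of
+# indices, it is meant to be passed as ``batch_sampler`` to a ``DataLoader``,
+# e.g. for a dataset of (label, text) pairs where the text sits at position 1:
+#
+#   sampler = ClassificationCollator(train_data, tokenizer, text_idx=1, batch_size=32)
+#   loader = torch.utils.data.DataLoader(train_data, batch_sampler=sampler,
+#                                        collate_fn=my_collate_fn)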
diff --git a/flexnlp/utils/collators/collate_functions.py b/flexnlp/utils/collators/collate_functions.py
new file mode 100644
index 0000000..ba26457
--- /dev/null
+++ b/flexnlp/utils/collators/collate_functions.py
@@ -0,0 +1,49 @@
+import torch
+
+
+def default_data_collator_classification(batch):
+    """Default data collator for classification. It expects a batch
+    that contains labels and text.
+
+    This function does not apply any preprocessing to the text or
+    the labels, so they must be preprocessed beforehand via the
+    torch.utils.data.Dataset. This means that the function should
+    receive the text already tokenized and converted to ids, and the
+    labels already mapped to the interval 0,..,n in the case of a
+    classification problem, or to the expected type otherwise.
+
+    Args:
+        batch (Batch): Batch with the elements to process.
+    Returns:
+        tuple: Tuple with the text and the labels of the batch.
+    """
+    label_list, text_list = [], []
+    for (_label, _text) in batch:
+        label_list.append(_label)
+        # Keep tensors as they are; convert anything else to a tensor.
+        text_list.append(_text if isinstance(_text, torch.Tensor) else torch.tensor(_text))
+    label_list = torch.tensor(label_list, dtype=torch.int64)
+    return text_list, label_list
+
+def basic_collate_pad_sequence_classification(batch):
+    """Basic collate function that converts the batch into torch
+    tensors and returns the text padded.
+
+    This function does not apply any preprocessing to the text or
+    the labels, so they must be preprocessed beforehand via the
+    torch.utils.data.Dataset. This means that the function should
+    receive the text already tokenized and converted to ids, and the
+    labels already mapped to the interval 0,..,n in the case of a
+    classification problem, or to the expected type otherwise.
+
+    Args:
+        batch (Batch): Batch with the elements to process.
+    Returns:
+        tuple: Tuple containing the text padded with the pad_sequence
+            function and the labels for the batch.
+    """
+    label_list, text_list = [], []
+    for (_label, _text) in batch:
+        label_list.append(_label)
+        # Keep tensors as they are; convert anything else to a tensor.
+        text_list.append(_text if isinstance(_text, torch.Tensor) else torch.tensor(_text))
+    label_list = torch.tensor(label_list, dtype=torch.int64)
+    return torch.nn.utils.rnn.pad_sequence(text_list, batch_first=True), label_list
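+
+# Usage sketch (illustrative, with made-up token ids): each sample is expected to be
+# a (label, ids) pair, so the functions can be passed directly as ``collate_fn``:
+#
+#   samples = [(0, [5, 12, 7]), (1, [3, 9])]
+#   texts, labels = basic_collate_pad_sequence_classification(samples)
+#   # texts has shape (2, 3); the shorter sequence is right-padded with zeros.
+#
+#   loader = torch.utils.data.DataLoader(samples, batch_size=2,
+#                                        collate_fn=basic_collate_pad_sequence_classification)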
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..94e57c3
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,55 @@
+from setuptools import find_packages, setup
+
+
+TF_requires = ["tensorflow<2.11",  # https://github.com/tensorflow/tensorflow/issues/58973
+               "tensorflow_datasets",
+               "tensorflow_hub"
+               ]
+
+PT_requires = ["torch",
+               "torchvision",
+               "torchtext",
+               "torchdata",
+               "portalocker",
+               ]
+
+HF_requires = ["datasets"]
+
+setup(
+    name="flexnlp",
+    version="0.0.1",
+    author="Cristina Zuheros-Montes and Argente-Garrido Alberto",
+    long_description=open("README.md", "r", encoding="utf-8").read(),
+    long_description_content_type="text/markdown",
+    keywords="FL federated-learning flexible NLP",
+    url="https://github.com/FLEXible-FL/flex-nlp",
+    packages=find_packages(),
+    install_requires=["numpy",
+                      "multiprocess",
+                      "scikit-learn",
+                      "cardinality",
+                      "sultan",
+                      "tqdm",
+                      "scipy",
+                      "gdown",
+                      # "flexible",
+                      "torch",
+                      "torchtext",
+                      "portalocker",
+                      "torchdata",
+                      ],
+    extras_require={
+        "tensorflow": TF_requires,
+        "pytorch": PT_requires,
+        "huggingface": HF_requires,
+        "develop": ["pytest",
+                    "pytest-cov",
+                    "pytest-xdist",
+                    "coverage",
+                    "jinja2",
+                    *TF_requires,
+                    *HF_requires
+                    ],
+    },
+    python_requires=">=3.8.10",
+)
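+
+# Installation sketch (assumes a local clone of this repository); the extras defined
+# above can be combined, e.g.:
+#   pip install -e ".[pytorch,develop]"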