From 83b0609b4d1dcd8885ad66ba413f3aaec1a0a41d Mon Sep 17 00:00:00 2001 From: ixaxaar Date: Thu, 18 Jan 2024 18:01:34 +0530 Subject: [PATCH 1/4] add notebook bolt --- geniusrise_text/__init__.py | 1 + geniusrise_text/notebook/__init__.py | 16 + geniusrise_text/notebook/notebook.py | 200 ++++++++++ .../templates/AutoModelForCausalLM.jinja | 100 +++++ .../AutoModelForQuestionAnswering.jinja | 107 +++++ .../templates/AutoModelForSeq2SeqLM.jinja | 101 +++++ .../AutoModelForSequenceClassification.jinja | 101 +++++ .../AutoModelForTableQuestionAnswering.jinja | 104 +++++ .../AutoModelForTokenClassification.jinja | 103 +++++ requirements.txt | 365 +++++++++--------- 10 files changed, 1007 insertions(+), 191 deletions(-) create mode 100644 geniusrise_text/notebook/__init__.py create mode 100644 geniusrise_text/notebook/notebook.py create mode 100644 geniusrise_text/notebook/templates/AutoModelForCausalLM.jinja create mode 100644 geniusrise_text/notebook/templates/AutoModelForQuestionAnswering.jinja create mode 100644 geniusrise_text/notebook/templates/AutoModelForSeq2SeqLM.jinja create mode 100644 geniusrise_text/notebook/templates/AutoModelForSequenceClassification.jinja create mode 100644 geniusrise_text/notebook/templates/AutoModelForTableQuestionAnswering.jinja create mode 100644 geniusrise_text/notebook/templates/AutoModelForTokenClassification.jinja diff --git a/geniusrise_text/__init__.py b/geniusrise_text/__init__.py index 2f5cf2d..cfd6aa0 100644 --- a/geniusrise_text/__init__.py +++ b/geniusrise_text/__init__.py @@ -22,3 +22,4 @@ from .qa import QAAPI, QABulk, QAFineTuner from .summarization import SummarizationAPI, SummarizationBulk, SummarizationFineTuner from .translation import TranslationAPI, TranslationBulk, TranslationFineTuner +from .notebook import TextJupyterNotebook diff --git a/geniusrise_text/notebook/__init__.py b/geniusrise_text/notebook/__init__.py new file mode 100644 index 0000000..6d0778e --- /dev/null +++ b/geniusrise_text/notebook/__init__.py 
@@ -0,0 +1,16 @@
+# 🧠 Geniusrise
+# Copyright (C) 2023 geniusrise.ai
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .notebook import TextJupyterNotebook
diff --git a/geniusrise_text/notebook/notebook.py b/geniusrise_text/notebook/notebook.py
new file mode 100644
index 0000000..d698d0d
--- /dev/null
+++ b/geniusrise_text/notebook/notebook.py
@@ -0,0 +1,260 @@
+# 🧠 Geniusrise
+# Copyright (C) 2023 geniusrise.ai
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import subprocess
+import sys
+from pathlib import Path
+from jinja2 import Environment, FileSystemLoader
+from nbformat import v4 as nbf
+from geniusrise import BatchInput, BatchOutput, Bolt, State
+from geniusrise.logging import setup_logger
+from typing import Any, Dict, List, Optional
+
+# The templates ship inside this package, so resolve them relative to this file
+# rather than the process working directory.
+TEMPLATE_DIR = Path(__file__).resolve().parent / "templates"
+
+
+class TextJupyterNotebook(Bolt):
+    """
+    Bolt that renders a demo notebook for a Hugging Face model class and serves it with Jupyter.
+
+    `create` is the entry point: it renders the template for the requested `model_class` into
+    the output folder, installs packages and notebook extensions, applies a dark theme and
+    finally starts a Jupyter Notebook server on that folder.
+    """
+
+    def __init__(
+        self,
+        input: BatchInput,
+        output: BatchOutput,
+        state: State,
+        **kwargs,
+    ):
+        """
+        Initialize the bolt.
+
+        Args:
+            input (BatchInput): Input configuration, forwarded to `Bolt`.
+            output (BatchOutput): Output configuration; its `output_folder` receives the notebook.
+            state (State): State manager, forwarded to `Bolt`.
+            **kwargs: Accepted for interface compatibility; currently unused.
+        """
+        super().__init__(input=input, output=output, state=state)
+        self.log = setup_logger(self)
+        # Created here rather than in `create` so `create_notebook` also works on its own.
+        self.env = Environment(loader=FileSystemLoader(str(TEMPLATE_DIR)))
+
+    def create(
+        self,
+        model_name: str,
+        tokenizer_name: str,
+        model_revision: Optional[str] = None,
+        tokenizer_revision: Optional[str] = None,
+        model_class: str = "AutoModelForCausalLM",
+        tokenizer_class: str = "AutoTokenizer",
+        use_cuda: bool = False,
+        precision: str = "float16",
+        quantization: int = 0,
+        device_map: str | Dict | None = "auto",
+        torchscript: bool = False,
+        compile: bool = True,
+        awq_enabled: bool = False,
+        flash_attention: bool = False,
+        port: int = 8888,
+        password: Optional[str] = None,
+        **model_args: Any,
+    ):
+        """
+        Render the notebook for `model_class`, install its tooling and start the Jupyter server.
+
+        The model and tokenizer options are stored on the instance and handed to the template
+        as rendering context. The call blocks until the server process exits.
+
+        Args:
+            model_name (str): Model identifier rendered into the notebook.
+            tokenizer_name (str): Tokenizer identifier rendered into the notebook.
+            model_revision (Optional[str]): Model revision rendered into the notebook.
+            tokenizer_revision (Optional[str]): Tokenizer revision rendered into the notebook.
+            model_class (str): Selects the template; must be a key of `class_to_template_map`.
+            tokenizer_class (str): Tokenizer class name rendered into the notebook.
+            port (int): Port for the Jupyter server. Default is 8888.
+            password (Optional[str]): Server password. If None, no password is set.
+            **model_args (Any): Extra options, stored and exposed to the template as `model_args`.
+
+        The remaining keyword options (`use_cuda`, `precision`, `quantization`, `device_map`,
+        `torchscript`, `compile`, `awq_enabled`, `flash_attention`) are stored on the instance
+        and exposed to the template under the same names.
+
+        Raises:
+            KeyError: If `model_class` has no bundled template.
+            subprocess.CalledProcessError: If an install step or the server exits non-zero.
+        """
+        self.model_name = model_name
+        self.tokenizer_name = tokenizer_name
+        self.model_revision = model_revision
+        self.tokenizer_revision = tokenizer_revision
+        self.model_class = model_class
+        self.tokenizer_class = tokenizer_class
+        self.use_cuda = use_cuda
+        self.precision = precision
+        self.quantization = quantization
+        self.device_map = device_map
+        self.torchscript = torchscript
+        self.compile = compile
+        self.awq_enabled = awq_enabled
+        self.flash_attention = flash_attention
+        self.model_args = model_args
+
+        # Context for Jinja template
+        context = {
+            "model_name": model_name,
+            "tokenizer_name": tokenizer_name,
+            "model_revision": model_revision,
+            "tokenizer_revision": tokenizer_revision,
+            "model_class": model_class,
+            "tokenizer_class": tokenizer_class,
+            "use_cuda": use_cuda,
+            "precision": precision,
+            "quantization": quantization,
+            "device_map": device_map,
+            "torchscript": torchscript,
+            "compile": compile,
+            "awq_enabled": awq_enabled,
+            "flash_attention": flash_attention,
+            "model_args": model_args,
+        }
+        output_path = self.output.output_folder
+        # Template names are relative to TEMPLATE_DIR, the loader's search path.
+        class_to_template_map = {
+            "AutoModelForCausalLM": "AutoModelForCausalLM.jinja",
+            "AutoModelForTokenClassification": "AutoModelForTokenClassification.jinja",
+            "AutoModelForSequenceClassification": "AutoModelForSequenceClassification.jinja",
+            "AutoModelForTableQuestionAnswering": "AutoModelForTableQuestionAnswering.jinja",
+            "AutoModelForQuestionAnswering": "AutoModelForQuestionAnswering.jinja",
+            "AutoModelForSeq2SeqLM": "AutoModelForSeq2SeqLM.jinja",
+        }
+
+        try:
+            template_name = class_to_template_map[model_class]
+        except KeyError:
+            self.log.error(f"No notebook template for model_class {model_class!r}")
+            raise
+
+        self.create_notebook(name=template_name, context=context, output_path=f"{output_path}/notebook.ipynb")
+
+        self.install_packages(
+            [
+                "numpy==1.21.6",
+                "scikit-learn==1.3.0",
+                "pandas==1.3.5",
+                "matplotlib-inline==0.1.6",
+                "seaborn==0.13.1",
+                "torch==2.1.2",
+                "tensorflow==2.15.0",
+                "transformers",
+                "datasets",
+                "evaluate",
+                "diffusers",
+                "nemo_toolkit[all]",
+                "jupyterthemes",
+                "jupyter==1.0.0",
+            ]
+        )
+        self.install_jupyter_extensions(
+            [
+                "jupyter_contrib_nbextensions",
+                "jupyter_nbextensions_configurator",
+                "jupyter_tensorboard",
+                "rise",
+                "nbdime",
+            ]
+        )
+        self.enable_jupyter_dark_theme()
+
+        self.start_jupyter_server(notebook_dir=output_path, port=port, password=password)
+
+    def create_notebook(self, name: str, context: dict, output_path: str):
+        """
+        Create a Jupyter Notebook from a Jinja template.
+
+        Args:
+            name (str): Template file name, relative to the environment's search path.
+            context (dict): Context variables to render the template.
+            output_path (str): Path to save the generated notebook.
+        """
+        template = self.env.get_template(name)
+        notebook_json = template.render(context)
+        # nbformat.v4 only exposes reads()/writes(); it has no write() and reads() forwards
+        # extra keyword arguments to the JSON decoder, so `as_version` is not accepted.
+        notebook = nbf.reads(notebook_json)
+
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write(nbf.writes(notebook))
+        self.log.info(f"Notebook created at {output_path}")
+
+    def start_jupyter_server(self, notebook_dir: str, port: int = 8888, password: Optional[str] = None):
+        """
+        Start a Jupyter Notebook server in the specified directory with an optional port and password.
+
+        Args:
+            notebook_dir (str): Directory where the notebook server should start.
+            port (int): Port number for the notebook server. Default is 8888.
+            password (Optional[str]): Password for accessing the notebook server. If None, no password is set.
+        """
+        command = ["jupyter", "notebook", "--notebook-dir", notebook_dir, "--port", str(port)]
+
+        if password:
+            try:
+                from notebook.auth import passwd
+            except ImportError:
+                # Newer notebook releases no longer ship `notebook.auth`.
+                from jupyter_server.auth import passwd
+
+            hashed_password = passwd(password)
+            # No shell is involved, so wrapping the hash in quotes would make them part of the value.
+            command.append(f"--NotebookApp.password={hashed_password}")
+
+        subprocess.run(command, check=True)
+
+    def install_packages(self, packages: List[str]):
+        """
+        Install Python packages using pip.
+
+        Args:
+            packages (List[str]): List of package names to install.
+        """
+        for package in packages:
+            subprocess.run([sys.executable, "-m", "pip", "install", package], check=True)
+        self.log.info("Required packages installed.")
+
+    def install_jupyter_extensions(self, extensions: List[str]):
+        """
+        Install Jupyter Notebook extensions.
+
+        Args:
+            extensions (List[str]): List of Jupyter extension names to install.
+        """
+        for extension in extensions:
+            subprocess.run(["jupyter", "nbextension", "install", extension, "--user"], check=True)
+            subprocess.run(["jupyter", "nbextension", "enable", extension, "--user"], check=True)
+        self.log.info("Jupyter extensions installed and enabled.")
+
+    def enable_jupyter_dark_theme(self):
+        """
+        Enable dark theme for Jupyter Notebook.
+ """ + subprocess.run(["jt", "-t", "onedork"], check=True) # Example: using 'onedork' theme from jt (jupyterthemes) + self.log.info("Jupyter dark theme enabled.") diff --git a/geniusrise_text/notebook/templates/AutoModelForCausalLM.jinja b/geniusrise_text/notebook/templates/AutoModelForCausalLM.jinja new file mode 100644 index 0000000..ad3f7be --- /dev/null +++ b/geniusrise_text/notebook/templates/AutoModelForCausalLM.jinja @@ -0,0 +1,100 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# {{ model_class }} Demonstration\n", + "\n", + "This notebook demonstrates how to load and use the `{{ model_class }}` from Hugging Face's Transformers library." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Importing necessary libraries\n", + "from transformers import {{ model_class }}, {{ tokenizer_class }}\n", + "import torch\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading the Model and Tokenizer\n", + "\n", + "Here we load the model and tokenizer. We are using the model `{{ model_name }}` and its corresponding tokenizer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Loading the model and tokenizer\n", + "model = {{ model_class }}.from_pretrained('{{ model_name }}', revision='{{ model_revision }}')\n", + "tokenizer = {{ tokenizer_class }}.from_pretrained('{{ tokenizer_name }}', revision='{{ tokenizer_revision }}')\n", + "\n", + "# Additional configurations\n", + "model.to('cuda' if torch.cuda.is_available() and {{ use_cuda }} else 'cpu')\n", + "if '{{ precision }}' == 'float16':\n", + " model = model.half()\n", + "\n", + "# Describe each configuration and its impact here..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Inference\n", + "\n", + "Now, let's use the model to generate some text. 
We will provide a prompt, and the model will generate a continuation of it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Generating text\n", + "prompt = 'Today is a beautiful day'\n", + "inputs = tokenizer(prompt, return_tensors='pt')\n", + "inputs.to(model.device)\n", + "\n", + "# Generate a response\n", + "with torch.no_grad():\n", + " outputs = model.generate(**inputs)\n", + " generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", + "\n", + "print(generated_text)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/geniusrise_text/notebook/templates/AutoModelForQuestionAnswering.jinja b/geniusrise_text/notebook/templates/AutoModelForQuestionAnswering.jinja new file mode 100644 index 0000000..b9ac2f4 --- /dev/null +++ b/geniusrise_text/notebook/templates/AutoModelForQuestionAnswering.jinja @@ -0,0 +1,107 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# {{ model_class }} Demonstration\n", + "\n", + "This notebook demonstrates the use of `{{ model_class }}` from the Hugging Face Transformers library for question answering tasks." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import necessary libraries\n", + "from transformers import {{ model_class }}, {{ tokenizer_class }}\n", + "import torch\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading the Model and Tokenizer\n", + "\n", + "Load the model `{{ model_name }}` and its corresponding tokenizer for question answering." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the model and tokenizer\n", + "model = {{ model_class }}.from_pretrained('{{ model_name }}', revision='{{ model_revision }}')\n", + "tokenizer = {{ tokenizer_class }}.from_pretrained('{{ tokenizer_name }}', revision='{{ tokenizer_revision }}')\n", + "\n", + "# Configurations\n", + "model.to('cuda' if torch.cuda.is_available() and {{ use_cuda }} else 'cpu')\n", + "if '{{ precision }}' == 'float16':\n", + " model = model.half()\n", + "\n", + "# Describe each configuration and its impact here..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Question Answering\n", + "\n", + "Let's demonstrate how to use the model for question answering. We'll provide a context and a question, and the model will find the answer within the context." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Example context and question for demonstration\n", + "context = '...' # Define your context here\n", + "question = '...' 
# Define your question here\n", + "\n", + "# Tokenize and encode the context and question for the model\n", + "inputs = tokenizer.encode_plus(question, context, return_tensors='pt')\n", + "inputs.to(model.device)\n", + "\n", + "# Perform question answering\n", + "with torch.no_grad():\n", + " outputs = model(**inputs)\n", + " answer_start_scores, answer_end_scores = outputs.start_logits, outputs.end_logits\n", + " answer_start = torch.argmax(answer_start_scores)\n", + " answer_end = torch.argmax(answer_end_scores) + 1\n", + " answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))\n", + "\n", + "# Display the answer\n", + "print(answer)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/geniusrise_text/notebook/templates/AutoModelForSeq2SeqLM.jinja b/geniusrise_text/notebook/templates/AutoModelForSeq2SeqLM.jinja new file mode 100644 index 0000000..c770a50 --- /dev/null +++ b/geniusrise_text/notebook/templates/AutoModelForSeq2SeqLM.jinja @@ -0,0 +1,101 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# {{ model_class }} Demonstration\n", + "\n", + "This notebook demonstrates the use of `{{ model_class }}` from the Hugging Face Transformers library for sequence-to-sequence tasks." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import necessary libraries\n", + "from transformers import {{ model_class }}, {{ tokenizer_class }}\n", + "import torch\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading the Model and Tokenizer\n", + "\n", + "Load the model `{{ model_name }}` and its corresponding tokenizer for sequence-to-sequence tasks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the model and tokenizer\n", + "model = {{ model_class }}.from_pretrained('{{ model_name }}', revision='{{ model_revision }}')\n", + "tokenizer = {{ tokenizer_class }}.from_pretrained('{{ tokenizer_name }}', revision='{{ tokenizer_revision }}')\n", + "\n", + "# Configurations\n", + "model.to('cuda' if torch.cuda.is_available() and {{ use_cuda }} else 'cpu')\n", + "if '{{ precision }}' == 'float16':\n", + " model = model.half()\n", + "\n", + "# Describe each configuration and its impact here..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sequence-to-Sequence Task\n", + "\n", + "Now, let's demonstrate how to use the model for a sequence-to-sequence task. We'll provide a source sequence, and the model will generate the target sequence." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Example source sequence for sequence-to-sequence task\n", + "source_sequence = '...' 
# Define your source sequence here\n", + "inputs = tokenizer(source_sequence, return_tensors='pt')\n", + "inputs.to(model.device)\n", + "\n", + "# Generate the target sequence\n", + "with torch.no_grad():\n", + " outputs = model.generate(**inputs)\n", + " target_sequence = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", + "\n", + "# Display the target sequence\n", + "print(target_sequence)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/geniusrise_text/notebook/templates/AutoModelForSequenceClassification.jinja b/geniusrise_text/notebook/templates/AutoModelForSequenceClassification.jinja new file mode 100644 index 0000000..435269d --- /dev/null +++ b/geniusrise_text/notebook/templates/AutoModelForSequenceClassification.jinja @@ -0,0 +1,101 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# {{ model_class }} Demonstration\n", + "\n", + "This notebook demonstrates the use of `{{ model_class }}` from the Hugging Face Transformers library for sequence classification tasks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import necessary libraries\n", + "from transformers import {{ model_class }}, {{ tokenizer_class }}\n", + "import torch\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading the Model and Tokenizer\n", + "\n", + "Load the model `{{ model_name }}` and its corresponding tokenizer for sequence classification." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the model and tokenizer\n", + "model = {{ model_class }}.from_pretrained('{{ model_name }}', revision='{{ model_revision }}')\n", + "tokenizer = {{ tokenizer_class }}.from_pretrained('{{ tokenizer_name }}', revision='{{ tokenizer_revision }}')\n", + "\n", + "# Configurations\n", + "model.to('cuda' if torch.cuda.is_available() and {{ use_cuda }} else 'cpu')\n", + "if '{{ precision }}' == 'float16':\n", + " model = model.half()\n", + "\n", + "# Describe each configuration and its impact here..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sequence Classification\n", + "\n", + "We will now demonstrate how to use the model for sequence classification. Let's classify the sentiment of an example sentence." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Example sentence for sequence classification\n", + "sentence = 'I love using transformers for natural language processing.'\n", + "inputs = tokenizer(sentence, return_tensors='pt')\n", + "inputs.to(model.device)\n", + "\n", + "# Perform sequence classification\n", + "with torch.no_grad():\n", + " outputs = model(**inputs)\n", + " predictions = torch.argmax(outputs.logits, dim=-1)\n", + "\n", + "# Process and display the results\n", + "print(f'Sentence sentiment: {model.config.id2label[predictions.item()]}')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git 
a/geniusrise_text/notebook/templates/AutoModelForTableQuestionAnswering.jinja b/geniusrise_text/notebook/templates/AutoModelForTableQuestionAnswering.jinja new file mode 100644 index 0000000..20d55ed --- /dev/null +++ b/geniusrise_text/notebook/templates/AutoModelForTableQuestionAnswering.jinja @@ -0,0 +1,104 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# {{ model_class }} Demonstration\n", + "\n", + "This notebook demonstrates the use of `{{ model_class }}` from Hugging Face Transformers library for table-based question answering tasks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import necessary libraries\n", + "from transformers import {{ model_class }}, {{ tokenizer_class }}\n", + "import torch\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading the Model and Tokenizer\n", + "\n", + "Load the model `{{ model_name }}` and its corresponding tokenizer for table question answering." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the model and tokenizer\n", + "model = {{ model_class }}.from_pretrained('{{ model_name }}', revision='{{ model_revision }}')\n", + "tokenizer = {{ tokenizer_class }}.from_pretrained('{{ tokenizer_name }}', revision='{{ tokenizer_revision }}')\n", + "\n", + "# Configurations\n", + "model.to('cuda' if torch.cuda.is_available() and {{ use_cuda }} else 'cpu')\n", + "if '{{ precision }}' == 'float16':\n", + " model = model.half()\n", + "\n", + "# Describe each configuration and its impact here..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table-based Question Answering\n", + "\n", + "Now, let's demonstrate how to use the model for answering questions based on tabular data. We will provide a table and a question, and the model will find the answer from the table." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Example table and question for demonstration\n", + "table_data = '...' # Define or load your table data here\n", + "question = 'What is the ...?' # Define your question here\n", + "\n", + "# Tokenize and encode the table and question for the model\n", + "inputs = tokenizer(table=table_data, queries=question, return_tensors='pt')\n", + "inputs.to(model.device)\n", + "\n", + "# Perform question answering\n", + "with torch.no_grad():\n", + " outputs = model(**inputs)\n", + " answer = tokenizer.decode(outputs[0])\n", + "\n", + "# Display the answer\n", + "print(answer)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/geniusrise_text/notebook/templates/AutoModelForTokenClassification.jinja b/geniusrise_text/notebook/templates/AutoModelForTokenClassification.jinja new file mode 100644 index 0000000..ce05c8a --- /dev/null +++ b/geniusrise_text/notebook/templates/AutoModelForTokenClassification.jinja @@ -0,0 +1,103 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# {{ model_class }} Demonstration\n", + "\n", + "This notebook demonstrates the use of `{{ model_class }}` from the Hugging Face Transformers library for token classification tasks." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import necessary libraries\n", + "from transformers import {{ model_class }}, {{ tokenizer_class }}\n", + "import torch\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading the Model and Tokenizer\n", + "\n", + "Load the model `{{ model_name }}` and its corresponding tokenizer for token classification." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the model and tokenizer\n", + "model = {{ model_class }}.from_pretrained('{{ model_name }}', revision='{{ model_revision }}')\n", + "tokenizer = {{ tokenizer_class }}.from_pretrained('{{ tokenizer_name }}', revision='{{ tokenizer_revision }}')\n", + "\n", + "# Configurations\n", + "model.to('cuda' if torch.cuda.is_available() and {{ use_cuda }} else 'cpu')\n", + "if '{{ precision }}' == 'float16':\n", + " model = model.half()\n", + "\n", + "# Describe each configuration and its impact here..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Token Classification\n", + "\n", + "We will now demonstrate how to use the model for token classification. Let's take an example sentence and classify each token." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Example sentence for token classification\n", + "sentence = 'Hugging Face is a technology company based in New York.'\n", + "inputs = tokenizer(sentence, return_tensors='pt')\n", + "inputs.to(model.device)\n", + "\n", + "# Perform token classification\n", + "with torch.no_grad():\n", + " outputs = model(**inputs)\n", + " predictions = torch.argmax(outputs.logits, dim=-1)\n", + "\n", + "# Process and display the results\n", + "tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])\n", + "for token, prediction in zip(tokens, predictions[0]):\n", + " print(f'{token}: {model.config.id2label[prediction.item()]}')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/requirements.txt b/requirements.txt index b1fe2c4..2d5a35c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,132 +1,105 @@ absl-py==1.4.0 accelerate==0.22.0 -aiohttp==3.8.6 +aiohttp==3.8.5 aiosignal==1.3.1 -alembic==1.13.0 -annotated-types==0.6.0 +alabaster==0.7.16 +aniso8601==9.0.1 +annotated-types==0.5.0 ansicolors==1.1.8 -anyio==4.1.0 -apache-airflow==2.8.0 -apache-airflow-providers-common-sql==1.9.0 -apache-airflow-providers-docker==3.8.2 -apache-airflow-providers-ftp==3.7.0 -apache-airflow-providers-http==4.8.0 -apache-airflow-providers-imap==3.5.0 -apache-airflow-providers-sqlite==3.6.0 -apache-beam==2.48.0 -apache-flink==1.18.0 -apache-flink-libraries==1.18.0 -apispec==6.3.0 -argcomplete==3.2.1 +antlr4-python3-runtime==4.9.3 +appdirs==1.4.4 argparse-color-formatter==1.2.2.post2 
argparse-manpage==4.4 -asgiref==3.7.2 +asciitree==0.3.3 asttokens==2.4.0 async-timeout==4.0.3 -attributedict==0.3.0 +attrdict==2.0.1 attrs==23.1.0 +audioread==3.0.1 auto-gptq==0.4.2 -autoawq==0.1.8 autocommand==2.2.2 autoflake==2.2.1 -avro-python3==1.9.2.1 Babel==2.14.0 backcall==0.2.0 -backoff==2.2.1 +beautifulsoup4==4.12.3 bitsandbytes==0.41.1 -black==23.12.1 +black==19.10b0 bleach==6.0.0 -blessings==1.7 blinker==1.7.0 -boto3==1.34.7 -botocore==1.34.7 +boto3==1.28.25 +botocore==1.31.25 +braceexpand==0.1.7 build==0.10.0 -cachelib==0.9.0 cachetools==5.3.1 -certifi==2023.11.17 -cffi==1.16.0 -chardet==5.2.0 -charset-normalizer==3.3.2 +cdifflib==1.2.6 +certifi==2023.7.22 +cffi==1.15.1 +charset-normalizer==3.2.0 cheroot==10.0.0 CherryPy==18.8.0 -click==8.1.7 -clickclick==20.10.2 -cloudpickle==2.2.1 +click==8.0.2 cmake==3.27.2 -codecov==2.1.13 colorama==0.4.6 coloredlogs==15.0.1 -colorlog==4.8.0 -colour-runner==0.1.1 -ConfigUpdater==3.2 -connexion==2.14.2 -coverage==7.3.4 -crcmod==1.7 -cron-descriptor==1.4.0 -croniter==2.0.1 -cryptography==41.0.7 -DataProperty==1.0.1 -datasets==2.16.0 +colorlog==6.7.0 +comm==0.2.1 +contourpy==1.2.0 +coverage==7.3.0 +cryptography==41.0.3 +cycler==0.12.1 +Cython==3.0.8 +datasets==2.14.4 decorator==5.1.1 -deepdiff==6.7.1 -Deprecated==1.2.14 -dill==0.3.1.1 +dill==0.3.7 direnv==2020.12.3 -distlib==0.3.8 -dnspython==2.4.2 -docker==7.0.0 +Distance==0.1.3 +docker-pycreds==0.4.0 docopt==0.6.2 docstring-parser==0.15 docutils==0.20.1 +editdistance==0.6.2 einops==0.7.0 -email-validator==1.3.1 emoji==2.7.0 env-file==2020.12.3 et-xmlfile==1.1.0 -evaluate==0.4.1 -exceptiongroup==1.2.0 +evaluate==0.4.0 +exceptiongroup==1.1.2 executing==1.2.0 -fastavro==1.4.7 +faiss-cpu==1.7.4 fasteners==0.19 -filelock==3.13.1 -find-libpython==0.3.0 +fastjsonschema==2.19.1 +fasttext==0.9.2 +filelock==3.12.2 flake8==6.1.0 -flash-attn==2.3.3 +flash-attn==2.3.4 Flask==2.2.5 -Flask-AppBuilder==4.3.10 -Flask-Babel==2.0.0 -Flask-Caching==2.1.0 -Flask-JWT-Extended==4.6.0 
-Flask-Limiter==3.5.0 -Flask-Login==0.6.3 -Flask-Session==0.5.0 -Flask-SQLAlchemy==2.5.1 -Flask-WTF==1.2.1 -frozenlist==1.4.1 -geniusrise==0.0.33 +Flask-RESTful==0.3.10 +fonttools==4.47.2 +frozenlist==1.4.0 +fsspec==2023.6.0 +ftfy==6.1.3 +g2p-en==2.1.0 +gdown==4.7.3 +geniusrise==0.0.16 +gitdb==4.0.11 +GitPython==3.1.41 google-auth==2.17.3 -google-re2==1.1 -googleapis-common-protos==1.62.0 +google-auth-oauthlib==1.2.0 GPUtil==1.4.0 -graphviz==0.20.1 -greenlet==3.0.2 grpcio==1.60.0 -gunicorn==21.2.0 -h11==0.14.0 -hdfs==2.7.2 -httpcore==0.16.3 -httplib2==0.20.4 -httpx==0.23.3 -huggingface-hub==0.20.1 +h5py==3.10.0 +huggingface-hub==0.20.2 humanfriendly==10.0 -idna==3.6 -importlib-metadata==6.11.0 -importlib-resources==6.1.1 +hydra-core==1.3.2 +idna==3.4 +ijson==3.2.3 +imagesize==1.4.1 +importlib-metadata==6.8.0 inflect==7.0.0 -inflection==0.5.1 iniconfig==2.0.0 -inspecta==0.1.3 ipython==8.15.0 +ipywidgets==8.1.1 +isort==5.13.2 itsdangerous==2.1.2 jaraco.classes==3.3.0 jaraco.collections==4.3.0 @@ -135,46 +108,58 @@ jaraco.functools==3.9.0 jaraco.text==3.11.1 jedi==0.19.0 jeepney==0.8.0 +jieba==0.42.1 Jinja2==3.1.2 +jiwer==2.5.2 jmespath==0.10.0 joblib==1.3.2 -jsonlines==4.0.0 jsonpickle==3.0.1 -jsonschema==4.20.0 -jsonschema-specifications==2023.11.2 +jsonschema==4.21.0 +jsonschema-specifications==2023.12.1 +jupyter_core==5.7.1 +jupyterlab-widgets==3.0.9 kafka-python==2.0.2 +kaldi-python-io==1.2.2 +kaldiio==2.18.0 keyring==24.2.0 -kubernetes==28.1.0 -lazy-object-proxy==1.10.0 -limits==3.7.0 -linkify-it-py==2.0.2 +kiwisolver==1.4.5 +kornia==0.7.1 +kubernetes==27.2.0 +latexcodec==2.0.1 +lazy_loader==0.3 +Levenshtein==0.22.0 +librosa==0.10.1 +lightning-utilities==0.9.0 lit==16.0.6 -lm_eval==0.4.0 -lockfile==0.12.2 +llvmlite==0.41.1 +loguru==0.7.2 lxml==4.9.3 -Mako==1.3.0 -Markdown==3.5.1 +Markdown==3.5.2 markdown-it-py==3.0.0 +markdown2==2.4.12 MarkupSafe==2.1.3 -marshmallow==3.20.1 -marshmallow-oneofschema==3.0.1 -marshmallow-sqlalchemy==0.26.1 +marshmallow==3.20.2 
+matplotlib==3.8.2 matplotlib-inline==0.1.6 -mbstrdecoder==1.1.3 mccabe==0.7.0 -mdit-py-plugins==0.4.0 mdurl==0.1.2 +megatron-core==0.4.0 more-itertools==10.1.0 mpmath==1.3.0 +msgpack==1.0.7 multidict==6.0.4 multiprocess==0.70.15 -mypy==1.8.0 +mypy==1.5.0 mypy-extensions==1.0.0 +nbformat==5.9.2 +nemo-text-processing==0.2.2rc0 +nemo_toolkit==1.22.0 networkx==3.1 ninja==1.11.1.1 nltk==3.8.1 -numexpr==2.8.8 -numpy==1.21.6 +numba==0.58.1 +numcodecs==0.12.1 +numpy==1.23.5 nvidia-cublas-cu11==11.10.3.66 nvidia-cublas-cu12==12.1.3.1 nvidia-cuda-cupti-cu11==11.7.101 @@ -199,163 +184,161 @@ nvidia-nvjitlink-cu12==12.3.52 nvidia-nvtx-cu11==11.7.91 nvidia-nvtx-cu12==12.1.105 oauthlib==3.2.2 -objsize==0.6.1 +omegaconf==2.3.0 +onnx==1.15.0 +OpenCC==1.1.6 openpyxl==3.1.2 -opentelemetry-api==1.22.0 -opentelemetry-exporter-otlp==1.22.0 -opentelemetry-exporter-otlp-proto-common==1.22.0 -opentelemetry-exporter-otlp-proto-grpc==1.22.0 -opentelemetry-exporter-otlp-proto-http==1.22.0 -opentelemetry-proto==1.22.0 -opentelemetry-sdk==1.22.0 -opentelemetry-semantic-conventions==0.43b0 optimum==1.13.2 -ordered-set==4.1.0 -orjson==3.9.7 -packaging==23.2 -pandas==1.3.5 +packaging==23.1 +pandas==2.0.3 +pangu==4.0.6.1 +parameterized==0.9.0 parso==0.8.3 -pathspec==0.12.1 -pathvalidate==3.2.0 +pathspec==0.11.2 peft==0.5.0 -pemja==0.3.0 -pendulum==2.1.2 pexpect==4.8.0 pickleshare==0.7.5 Pillow==10.1.0 -pip-autoremove==0.10.0 -pipdeptree==2.13.1 pkginfo==1.9.6 +plac==1.4.2 platformdirs==3.10.0 -pluggy==1.3.0 +pluggy==1.2.0 +pooch==1.8.0 portalocker==2.7.0 portend==3.2.0 -prettytable==3.9.0 -prison==0.2.1 -prometheus-client==0.19.0 +prettytable==3.8.0 +progress==1.6 +prometheus-client==0.17.1 prompt-toolkit==3.0.39 -proto-plus==1.22.3 -protobuf==3.20.3 -psutil==5.9.6 -psycopg2==2.9.9 +protobuf==4.23.4 +psutil==5.9.5 +psycopg2==2.9.7 ptyprocess==0.7.0 pure-eval==0.2.2 -py4j==0.10.9.7 -pyarrow==8.0.0 -pyarrow-hotfix==0.6 +pyannote.core==5.0.0 +pyannote.database==5.0.1 +pyannote.metrics==3.2.1 
+pyarrow==13.0.0 pyasn1==0.5.0 pyasn1-modules==0.3.0 pybind11==2.11.1 +pybtex==0.24.0 +pybtex-docutils==1.0.3 pycodestyle==2.11.0 pycparser==2.21 -pydantic==2.5.2 -pydantic_core==2.14.5 -pydot==1.4.2 +pydantic==1.10.13 +pydantic_core==2.4.0 +pydub==0.25.1 pyflakes==3.1.0 -Pygments==2.17.2 -PyJWT==2.8.0 -pymongo==3.13.0 +Pygments==2.16.1 +pynini==2.1.5 pyparsing==3.1.1 -pyproject-api==1.6.1 +pypinyin==0.50.0 +pypinyin-dict==0.7.0 pyproject_hooks==1.0.0 -pyspark==3.5.0 -pytablewriter==1.2.0 -pytest==7.4.3 -pytest-asyncio==0.21.1 +PySocks==1.7.1 +pytest==7.4.0 pytest-cov==4.1.0 -python-daemon==3.0.1 +pytest-runner==6.0.1 python-dateutil==2.8.2 -python-dotenv==1.0.0 -python-nvd3==0.15.0 -python-slugify==8.0.1 -pytz==2023.3.post1 -pytzdata==2020.1 +pytorch-lightning==2.0.7 +pytz==2023.3 PyYAML==6.0.1 +rapidfuzz==2.13.7 readme-renderer==40.0 -redis==5.0.1 -referencing==0.32.0 +redis==4.6.0 +referencing==0.32.1 regex==2023.8.8 requests==2.31.0 requests-oauthlib==1.3.1 requests-toolbelt==1.0.0 +resampy==0.4.2 responses==0.18.0 retrying==1.3.4 -rfc3339-validator==0.1.4 -rfc3986==1.5.0 -rich==13.7.0 -rich-argparse==1.4.0 -rootpath==0.1.1 +rfc3986==2.0.0 +rich==13.5.2 +rich-argparse==1.3.0 rouge==1.0.1 rouge-score==0.1.2 -rpds-py==0.13.2 +rpds-py==0.17.1 rsa==4.9 -s3transfer==0.10.0 +ruamel.yaml==0.18.5 +ruamel.yaml.clib==0.2.8 +s3transfer==0.6.1 sacrebleu==2.3.1 sacremoses==0.0.53 safetensors==0.3.3 scikit-learn==1.3.0 scipy==1.11.2 +seaborn==0.13.1 SecretStorage==3.3.3 sentence-transformers==2.2.2 sentencepiece==0.1.99 +sentry-sdk==1.39.2 setproctitle==1.3.3 +shellingham==1.5.4 shortuuid==1.0.11 shtab==1.6.4 six==1.16.0 -sniffio==1.3.0 -spark==0.2.1 -SQLAlchemy==1.4.50 -SQLAlchemy-JSONField==1.0.2 -SQLAlchemy-Utils==0.41.1 -sqlitedict==2.1.0 -sqlparse==0.4.4 +smmap==5.0.1 +snowballstemmer==2.2.0 +sortedcontainers==2.4.0 +soundfile==0.12.1 +soupsieve==2.5 +sox==1.4.1 +soxr==0.3.7 +Sphinx==7.2.6 +sphinxcontrib-applehelp==1.0.8 +sphinxcontrib-bibtex==2.6.2 
+sphinxcontrib-devhelp==1.0.6 +sphinxcontrib-htmlhelp==2.0.5 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-qthelp==1.0.7 +sphinxcontrib-serializinghtml==1.1.10 stack-data==0.6.2 -streamz==0.6.4 sympy==1.12 -tabledata==1.3.3 tabulate==0.9.0 -tcolorpy==0.1.4 tempora==5.5.0 -tenacity==8.2.3 -termcolor==2.4.0 +tensorboard==2.15.1 +tensorboard-data-server==0.7.2 +tensorstore==0.1.45 +termcolor==2.3.0 text-unidecode==1.3 -texttable==1.7.0 +textdistance==4.6.1 +texterrors==0.4.4 threadpoolctl==3.2.0 tokenizers==0.15.0 toml==0.10.2 tomli==2.0.1 -toolz==0.12.0 -torch==2.1.2 +torch==2.1.0 +torchmetrics==1.3.0.post0 torchvision==0.16.0 -tornado==6.3.3 -tox==4.11.4 tqdm==4.66.1 -tqdm-multiprocess==0.0.11 traitlets==5.9.0 transformers==4.36.2 triton==2.1.0 trl==0.7.2 twine==4.0.2 -typepy==1.3.2 +typed-ast==1.5.5 +typer==0.9.0 types-PyYAML==6.0.12.11 -typing_extensions==4.9.0 +typing_extensions==4.7.1 tyro==0.5.10 tzdata==2023.3 -uc-micro-py==1.0.2 -unicodecsv==0.14.1 -universal-pathlib==0.1.4 urllib3==1.26.16 values==2020.12.3 -virtualenv==20.25.0 -wcwidth==0.2.6 +wandb==0.16.2 +wcwidth==0.2.13 +webdataset==0.1.62 webencodings==0.5.1 websocket-client==1.6.1 -Werkzeug==2.2.3 +Werkzeug==3.0.1 +wget==3.2 +widgetsnbextension==4.0.9 wrapt==1.16.0 -WTForms==3.1.1 xxhash==3.3.0 -yarl==1.9.4 +yarl==1.9.2 +youtokentome==1.0.6 +zarr==2.16.1 zc.lockfile==3.0.post1 -zict==3.0.0 -zipp==3.17.0 -zstandard==0.21.0 +zipp==3.16.2 From e70f305fbb0dc232ce7aeac528d25b1077d63a94 Mon Sep 17 00:00:00 2001 From: ixaxaar Date: Fri, 19 Jan 2024 01:52:30 +0530 Subject: [PATCH 2/4] fixes --- geniusrise_text/base/api.py | 3 +- geniusrise_text/notebook/notebook.py | 103 +++++--- requirements.txt | 365 ++++++++++++++------------- 3 files changed, 262 insertions(+), 209 deletions(-) diff --git a/geniusrise_text/base/api.py b/geniusrise_text/base/api.py index 82e65e2..d688fcf 100644 --- a/geniusrise_text/base/api.py +++ b/geniusrise_text/base/api.py @@ -212,7 +212,8 @@ def listen( else: model_revision = None 
tokenizer_revision = None - tokenizer_name = model_name + tokenizer_name = model_name + self.model_name = model_name self.model_revision = model_revision self.tokenizer_name = tokenizer_name diff --git a/geniusrise_text/notebook/notebook.py b/geniusrise_text/notebook/notebook.py index d698d0d..9f21d0c 100644 --- a/geniusrise_text/notebook/notebook.py +++ b/geniusrise_text/notebook/notebook.py @@ -13,10 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import subprocess import sys -from jinja2 import Environment, FileSystemLoader +from jinja2 import Environment, FileSystemLoader, Template from nbformat import v4 as nbf +import nbformat from geniusrise import BatchInput, BatchOutput, Bolt, State from geniusrise.logging import setup_logger from typing import Any, Dict, List, Optional @@ -32,13 +34,15 @@ def __init__( ): super().__init__(input=input, output=output, state=state) self.log = setup_logger(self) + script_dir = os.path.dirname(os.path.realpath(__file__)) + templates_dir = os.path.join(script_dir, "templates") + + # Initialize Jinja2 Environment with the correct templates directory + self.env = Environment(loader=FileSystemLoader(templates_dir)) def create( self, model_name: str, - tokenizer_name: str, - model_revision: Optional[str] = None, - tokenizer_revision: Optional[str] = None, model_class: str = "AutoModelForCausalLM", tokenizer_class: str = "AutoTokenizer", use_cuda: bool = False, @@ -53,10 +57,6 @@ def create( password: Optional[str] = None, **model_args: Any, ): - self.model_name = model_name - self.tokenizer_name = tokenizer_name - self.model_revision = model_revision - self.tokenizer_revision = tokenizer_revision self.model_class = model_class self.tokenizer_class = tokenizer_class self.use_cuda = use_cuda @@ -69,6 +69,21 @@ def create( self.flash_attention = flash_attention self.model_args = model_args + if ":" in model_name: + model_revision = model_name.split(":")[1] + 
tokenizer_revision = model_name.split(":")[1] + model_name = model_name.split(":")[0] + tokenizer_name = model_name + else: + model_revision = None + tokenizer_revision = None + tokenizer_name = model_name + + self.model_name = model_name + self.model_revision = model_revision + self.tokenizer_name = tokenizer_name + self.tokenizer_revision = tokenizer_revision + self.env = Environment(loader=FileSystemLoader("./templates")) # Context for Jinja template @@ -89,15 +104,25 @@ def create( "flash_attention": flash_attention, "model_args": model_args, } + + import os + + dir_path = os.path.dirname(os.path.realpath(__file__)) + output_path = self.output.output_folder + + script_dir = os.path.dirname(os.path.abspath(__file__)) + templates_dir = os.path.join(script_dir, "templates") + # fmt: off class_to_template_map = { - "AutoModelForCausalLM": "./templates/AutoModelForCausalLM.jinja", - "AutoModelForTokenClassification": "./templates/AutoModelForTokenClassification.jinja", - "AutoModelForSequenceClassification": "./templates/AutoModelForSequenceClassification.jinja", - "AutoModelForTableQuestionAnswering": "./templates/AutoModelForTableQuestionAnswering.jinja", - "AutoModelForQuestionAnswering": "./templates/AutoModelForQuestionAnswering.jinja", - "AutoModelForSeq2SeqLM": "./templates/AutoModelForSeq2SeqLM.jinja", + "AutoModelForCausalLM": os.path.join(templates_dir, "AutoModelForCausalLM.jinja"), + "AutoModelForTokenClassification": os.path.join(templates_dir, "AutoModelForTokenClassification.jinja"), + "AutoModelForSequenceClassification": os.path.join(templates_dir, "AutoModelForSequenceClassification.jinja"), + "AutoModelForTableQuestionAnswering": os.path.join(templates_dir, "AutoModelForTableQuestionAnswering.jinja"), + "AutoModelForQuestionAnswering": os.path.join(templates_dir, "AutoModelForQuestionAnswering.jinja"), + "AutoModelForSeq2SeqLM": os.path.join(templates_dir, "AutoModelForSeq2SeqLM.jinja"), } + # fmt: on template_name = 
class_to_template_map[model_class] @@ -121,15 +146,15 @@ def create( "jupyter==1.0.0", ] ) - self.install_jupyter_extensions( - [ - "jupyter_contrib_nbextensions", - "jupyter_nbextensions_configurator", - "jupyter_tensorboard", - "rise", - "nbdime", - ] - ) + # self.install_jupyter_extensions( + # [ + # "jupyter_contrib_nbextensions", + # "jupyter_nbextensions_configurator", + # "jupyter_tensorboard", + # "rise", + # "nbdime", + # ] + # ) self.enable_jupyter_dark_theme() self.start_jupyter_server(notebook_dir=output_path, port=port, password=password) @@ -142,12 +167,17 @@ def create_notebook(self, name: str, context: dict, output_path: str): context (dict): Context variables to render the template. output_path (str): Path to save the generated notebook. """ - template = self.env.get_template(name) + # template = self.env.get_template(name) + with open(name, "r") as file: + template_content = file.read() + + template = Template(template_content) + notebook_json = template.render(context) - notebook = nbf.reads(notebook_json, as_version=4) + notebook = nbf.reads(notebook_json) with open(output_path, "w") as f: - nbf.write(notebook, f) + nbformat.write(notebook, f) self.log.info(f"Notebook created at {output_path}") def start_jupyter_server(self, notebook_dir: str, port: int = 8888, password: Optional[str] = None): @@ -159,15 +189,20 @@ def start_jupyter_server(self, notebook_dir: str, port: int = 8888, password: Op port (int): Port number for the notebook server. Default is 8888. password (Optional[str]): Password for accessing the notebook server. If None, no password is set. 
""" - command = ["jupyter", "notebook", "--notebook-dir", notebook_dir, "--port", str(port)] - - if password: - from notebook.auth import passwd - - hashed_password = passwd(password) - command.extend(["--NotebookApp.password", f"'{hashed_password}'"]) - subprocess.run(command, check=True) + command = [ + "jupyter", + "notebook", + "--password", + password, + "--no-browser", + "--port", + str(port), + "--notebook-dir", + notebook_dir, + ] + + subprocess.run(command, check=True) # type: ignore def install_packages(self, packages: List[str]): """ diff --git a/requirements.txt b/requirements.txt index 2d5a35c..b1fe2c4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,105 +1,132 @@ absl-py==1.4.0 accelerate==0.22.0 -aiohttp==3.8.5 +aiohttp==3.8.6 aiosignal==1.3.1 -alabaster==0.7.16 -aniso8601==9.0.1 -annotated-types==0.5.0 +alembic==1.13.0 +annotated-types==0.6.0 ansicolors==1.1.8 -antlr4-python3-runtime==4.9.3 -appdirs==1.4.4 +anyio==4.1.0 +apache-airflow==2.8.0 +apache-airflow-providers-common-sql==1.9.0 +apache-airflow-providers-docker==3.8.2 +apache-airflow-providers-ftp==3.7.0 +apache-airflow-providers-http==4.8.0 +apache-airflow-providers-imap==3.5.0 +apache-airflow-providers-sqlite==3.6.0 +apache-beam==2.48.0 +apache-flink==1.18.0 +apache-flink-libraries==1.18.0 +apispec==6.3.0 +argcomplete==3.2.1 argparse-color-formatter==1.2.2.post2 argparse-manpage==4.4 -asciitree==0.3.3 +asgiref==3.7.2 asttokens==2.4.0 async-timeout==4.0.3 -attrdict==2.0.1 +attributedict==0.3.0 attrs==23.1.0 -audioread==3.0.1 auto-gptq==0.4.2 +autoawq==0.1.8 autocommand==2.2.2 autoflake==2.2.1 +avro-python3==1.9.2.1 Babel==2.14.0 backcall==0.2.0 -beautifulsoup4==4.12.3 +backoff==2.2.1 bitsandbytes==0.41.1 -black==19.10b0 +black==23.12.1 bleach==6.0.0 +blessings==1.7 blinker==1.7.0 -boto3==1.28.25 -botocore==1.31.25 -braceexpand==0.1.7 +boto3==1.34.7 +botocore==1.34.7 build==0.10.0 +cachelib==0.9.0 cachetools==5.3.1 -cdifflib==1.2.6 -certifi==2023.7.22 -cffi==1.15.1 
-charset-normalizer==3.2.0 +certifi==2023.11.17 +cffi==1.16.0 +chardet==5.2.0 +charset-normalizer==3.3.2 cheroot==10.0.0 CherryPy==18.8.0 -click==8.0.2 +click==8.1.7 +clickclick==20.10.2 +cloudpickle==2.2.1 cmake==3.27.2 +codecov==2.1.13 colorama==0.4.6 coloredlogs==15.0.1 -colorlog==6.7.0 -comm==0.2.1 -contourpy==1.2.0 -coverage==7.3.0 -cryptography==41.0.3 -cycler==0.12.1 -Cython==3.0.8 -datasets==2.14.4 +colorlog==4.8.0 +colour-runner==0.1.1 +ConfigUpdater==3.2 +connexion==2.14.2 +coverage==7.3.4 +crcmod==1.7 +cron-descriptor==1.4.0 +croniter==2.0.1 +cryptography==41.0.7 +DataProperty==1.0.1 +datasets==2.16.0 decorator==5.1.1 -dill==0.3.7 +deepdiff==6.7.1 +Deprecated==1.2.14 +dill==0.3.1.1 direnv==2020.12.3 -Distance==0.1.3 -docker-pycreds==0.4.0 +distlib==0.3.8 +dnspython==2.4.2 +docker==7.0.0 docopt==0.6.2 docstring-parser==0.15 docutils==0.20.1 -editdistance==0.6.2 einops==0.7.0 +email-validator==1.3.1 emoji==2.7.0 env-file==2020.12.3 et-xmlfile==1.1.0 -evaluate==0.4.0 -exceptiongroup==1.1.2 +evaluate==0.4.1 +exceptiongroup==1.2.0 executing==1.2.0 -faiss-cpu==1.7.4 +fastavro==1.4.7 fasteners==0.19 -fastjsonschema==2.19.1 -fasttext==0.9.2 -filelock==3.12.2 +filelock==3.13.1 +find-libpython==0.3.0 flake8==6.1.0 -flash-attn==2.3.4 +flash-attn==2.3.3 Flask==2.2.5 -Flask-RESTful==0.3.10 -fonttools==4.47.2 -frozenlist==1.4.0 -fsspec==2023.6.0 -ftfy==6.1.3 -g2p-en==2.1.0 -gdown==4.7.3 -geniusrise==0.0.16 -gitdb==4.0.11 -GitPython==3.1.41 +Flask-AppBuilder==4.3.10 +Flask-Babel==2.0.0 +Flask-Caching==2.1.0 +Flask-JWT-Extended==4.6.0 +Flask-Limiter==3.5.0 +Flask-Login==0.6.3 +Flask-Session==0.5.0 +Flask-SQLAlchemy==2.5.1 +Flask-WTF==1.2.1 +frozenlist==1.4.1 +geniusrise==0.0.33 google-auth==2.17.3 -google-auth-oauthlib==1.2.0 +google-re2==1.1 +googleapis-common-protos==1.62.0 GPUtil==1.4.0 +graphviz==0.20.1 +greenlet==3.0.2 grpcio==1.60.0 -h5py==3.10.0 -huggingface-hub==0.20.2 +gunicorn==21.2.0 +h11==0.14.0 +hdfs==2.7.2 +httpcore==0.16.3 +httplib2==0.20.4 +httpx==0.23.3 
+huggingface-hub==0.20.1 humanfriendly==10.0 -hydra-core==1.3.2 -idna==3.4 -ijson==3.2.3 -imagesize==1.4.1 -importlib-metadata==6.8.0 +idna==3.6 +importlib-metadata==6.11.0 +importlib-resources==6.1.1 inflect==7.0.0 +inflection==0.5.1 iniconfig==2.0.0 +inspecta==0.1.3 ipython==8.15.0 -ipywidgets==8.1.1 -isort==5.13.2 itsdangerous==2.1.2 jaraco.classes==3.3.0 jaraco.collections==4.3.0 @@ -108,58 +135,46 @@ jaraco.functools==3.9.0 jaraco.text==3.11.1 jedi==0.19.0 jeepney==0.8.0 -jieba==0.42.1 Jinja2==3.1.2 -jiwer==2.5.2 jmespath==0.10.0 joblib==1.3.2 +jsonlines==4.0.0 jsonpickle==3.0.1 -jsonschema==4.21.0 -jsonschema-specifications==2023.12.1 -jupyter_core==5.7.1 -jupyterlab-widgets==3.0.9 +jsonschema==4.20.0 +jsonschema-specifications==2023.11.2 kafka-python==2.0.2 -kaldi-python-io==1.2.2 -kaldiio==2.18.0 keyring==24.2.0 -kiwisolver==1.4.5 -kornia==0.7.1 -kubernetes==27.2.0 -latexcodec==2.0.1 -lazy_loader==0.3 -Levenshtein==0.22.0 -librosa==0.10.1 -lightning-utilities==0.9.0 +kubernetes==28.1.0 +lazy-object-proxy==1.10.0 +limits==3.7.0 +linkify-it-py==2.0.2 lit==16.0.6 -llvmlite==0.41.1 -loguru==0.7.2 +lm_eval==0.4.0 +lockfile==0.12.2 lxml==4.9.3 -Markdown==3.5.2 +Mako==1.3.0 +Markdown==3.5.1 markdown-it-py==3.0.0 -markdown2==2.4.12 MarkupSafe==2.1.3 -marshmallow==3.20.2 -matplotlib==3.8.2 +marshmallow==3.20.1 +marshmallow-oneofschema==3.0.1 +marshmallow-sqlalchemy==0.26.1 matplotlib-inline==0.1.6 +mbstrdecoder==1.1.3 mccabe==0.7.0 +mdit-py-plugins==0.4.0 mdurl==0.1.2 -megatron-core==0.4.0 more-itertools==10.1.0 mpmath==1.3.0 -msgpack==1.0.7 multidict==6.0.4 multiprocess==0.70.15 -mypy==1.5.0 +mypy==1.8.0 mypy-extensions==1.0.0 -nbformat==5.9.2 -nemo-text-processing==0.2.2rc0 -nemo_toolkit==1.22.0 networkx==3.1 ninja==1.11.1.1 nltk==3.8.1 -numba==0.58.1 -numcodecs==0.12.1 -numpy==1.23.5 +numexpr==2.8.8 +numpy==1.21.6 nvidia-cublas-cu11==11.10.3.66 nvidia-cublas-cu12==12.1.3.1 nvidia-cuda-cupti-cu11==11.7.101 @@ -184,161 +199,163 @@ nvidia-nvjitlink-cu12==12.3.52 
nvidia-nvtx-cu11==11.7.91 nvidia-nvtx-cu12==12.1.105 oauthlib==3.2.2 -omegaconf==2.3.0 -onnx==1.15.0 -OpenCC==1.1.6 +objsize==0.6.1 openpyxl==3.1.2 +opentelemetry-api==1.22.0 +opentelemetry-exporter-otlp==1.22.0 +opentelemetry-exporter-otlp-proto-common==1.22.0 +opentelemetry-exporter-otlp-proto-grpc==1.22.0 +opentelemetry-exporter-otlp-proto-http==1.22.0 +opentelemetry-proto==1.22.0 +opentelemetry-sdk==1.22.0 +opentelemetry-semantic-conventions==0.43b0 optimum==1.13.2 -packaging==23.1 -pandas==2.0.3 -pangu==4.0.6.1 -parameterized==0.9.0 +ordered-set==4.1.0 +orjson==3.9.7 +packaging==23.2 +pandas==1.3.5 parso==0.8.3 -pathspec==0.11.2 +pathspec==0.12.1 +pathvalidate==3.2.0 peft==0.5.0 +pemja==0.3.0 +pendulum==2.1.2 pexpect==4.8.0 pickleshare==0.7.5 Pillow==10.1.0 +pip-autoremove==0.10.0 +pipdeptree==2.13.1 pkginfo==1.9.6 -plac==1.4.2 platformdirs==3.10.0 -pluggy==1.2.0 -pooch==1.8.0 +pluggy==1.3.0 portalocker==2.7.0 portend==3.2.0 -prettytable==3.8.0 -progress==1.6 -prometheus-client==0.17.1 +prettytable==3.9.0 +prison==0.2.1 +prometheus-client==0.19.0 prompt-toolkit==3.0.39 -protobuf==4.23.4 -psutil==5.9.5 -psycopg2==2.9.7 +proto-plus==1.22.3 +protobuf==3.20.3 +psutil==5.9.6 +psycopg2==2.9.9 ptyprocess==0.7.0 pure-eval==0.2.2 -pyannote.core==5.0.0 -pyannote.database==5.0.1 -pyannote.metrics==3.2.1 -pyarrow==13.0.0 +py4j==0.10.9.7 +pyarrow==8.0.0 +pyarrow-hotfix==0.6 pyasn1==0.5.0 pyasn1-modules==0.3.0 pybind11==2.11.1 -pybtex==0.24.0 -pybtex-docutils==1.0.3 pycodestyle==2.11.0 pycparser==2.21 -pydantic==1.10.13 -pydantic_core==2.4.0 -pydub==0.25.1 +pydantic==2.5.2 +pydantic_core==2.14.5 +pydot==1.4.2 pyflakes==3.1.0 -Pygments==2.16.1 -pynini==2.1.5 +Pygments==2.17.2 +PyJWT==2.8.0 +pymongo==3.13.0 pyparsing==3.1.1 -pypinyin==0.50.0 -pypinyin-dict==0.7.0 +pyproject-api==1.6.1 pyproject_hooks==1.0.0 -PySocks==1.7.1 -pytest==7.4.0 +pyspark==3.5.0 +pytablewriter==1.2.0 +pytest==7.4.3 +pytest-asyncio==0.21.1 pytest-cov==4.1.0 -pytest-runner==6.0.1 +python-daemon==3.0.1 
python-dateutil==2.8.2 -pytorch-lightning==2.0.7 -pytz==2023.3 +python-dotenv==1.0.0 +python-nvd3==0.15.0 +python-slugify==8.0.1 +pytz==2023.3.post1 +pytzdata==2020.1 PyYAML==6.0.1 -rapidfuzz==2.13.7 readme-renderer==40.0 -redis==4.6.0 -referencing==0.32.1 +redis==5.0.1 +referencing==0.32.0 regex==2023.8.8 requests==2.31.0 requests-oauthlib==1.3.1 requests-toolbelt==1.0.0 -resampy==0.4.2 responses==0.18.0 retrying==1.3.4 -rfc3986==2.0.0 -rich==13.5.2 -rich-argparse==1.3.0 +rfc3339-validator==0.1.4 +rfc3986==1.5.0 +rich==13.7.0 +rich-argparse==1.4.0 +rootpath==0.1.1 rouge==1.0.1 rouge-score==0.1.2 -rpds-py==0.17.1 +rpds-py==0.13.2 rsa==4.9 -ruamel.yaml==0.18.5 -ruamel.yaml.clib==0.2.8 -s3transfer==0.6.1 +s3transfer==0.10.0 sacrebleu==2.3.1 sacremoses==0.0.53 safetensors==0.3.3 scikit-learn==1.3.0 scipy==1.11.2 -seaborn==0.13.1 SecretStorage==3.3.3 sentence-transformers==2.2.2 sentencepiece==0.1.99 -sentry-sdk==1.39.2 setproctitle==1.3.3 -shellingham==1.5.4 shortuuid==1.0.11 shtab==1.6.4 six==1.16.0 -smmap==5.0.1 -snowballstemmer==2.2.0 -sortedcontainers==2.4.0 -soundfile==0.12.1 -soupsieve==2.5 -sox==1.4.1 -soxr==0.3.7 -Sphinx==7.2.6 -sphinxcontrib-applehelp==1.0.8 -sphinxcontrib-bibtex==2.6.2 -sphinxcontrib-devhelp==1.0.6 -sphinxcontrib-htmlhelp==2.0.5 -sphinxcontrib-jsmath==1.0.1 -sphinxcontrib-qthelp==1.0.7 -sphinxcontrib-serializinghtml==1.1.10 +sniffio==1.3.0 +spark==0.2.1 +SQLAlchemy==1.4.50 +SQLAlchemy-JSONField==1.0.2 +SQLAlchemy-Utils==0.41.1 +sqlitedict==2.1.0 +sqlparse==0.4.4 stack-data==0.6.2 +streamz==0.6.4 sympy==1.12 +tabledata==1.3.3 tabulate==0.9.0 +tcolorpy==0.1.4 tempora==5.5.0 -tensorboard==2.15.1 -tensorboard-data-server==0.7.2 -tensorstore==0.1.45 -termcolor==2.3.0 +tenacity==8.2.3 +termcolor==2.4.0 text-unidecode==1.3 -textdistance==4.6.1 -texterrors==0.4.4 +texttable==1.7.0 threadpoolctl==3.2.0 tokenizers==0.15.0 toml==0.10.2 tomli==2.0.1 -torch==2.1.0 -torchmetrics==1.3.0.post0 +toolz==0.12.0 +torch==2.1.2 torchvision==0.16.0 +tornado==6.3.3 
+tox==4.11.4 tqdm==4.66.1 +tqdm-multiprocess==0.0.11 traitlets==5.9.0 transformers==4.36.2 triton==2.1.0 trl==0.7.2 twine==4.0.2 -typed-ast==1.5.5 -typer==0.9.0 +typepy==1.3.2 types-PyYAML==6.0.12.11 -typing_extensions==4.7.1 +typing_extensions==4.9.0 tyro==0.5.10 tzdata==2023.3 +uc-micro-py==1.0.2 +unicodecsv==0.14.1 +universal-pathlib==0.1.4 urllib3==1.26.16 values==2020.12.3 -wandb==0.16.2 -wcwidth==0.2.13 -webdataset==0.1.62 +virtualenv==20.25.0 +wcwidth==0.2.6 webencodings==0.5.1 websocket-client==1.6.1 -Werkzeug==3.0.1 -wget==3.2 -widgetsnbextension==4.0.9 +Werkzeug==2.2.3 wrapt==1.16.0 +WTForms==3.1.1 xxhash==3.3.0 -yarl==1.9.2 -youtokentome==1.0.6 -zarr==2.16.1 +yarl==1.9.4 zc.lockfile==3.0.post1 -zipp==3.16.2 +zict==3.0.0 +zipp==3.17.0 +zstandard==0.21.0 From a31430b1726ab754f453bf12c4c048a762cc9189 Mon Sep 17 00:00:00 2001 From: ixaxaar Date: Fri, 19 Jan 2024 17:56:01 +0530 Subject: [PATCH 3/4] fixes --- geniusrise_text/notebook/notebook.py | 41 ++++++++++------------------ 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/geniusrise_text/notebook/notebook.py b/geniusrise_text/notebook/notebook.py index 9f21d0c..b562418 100644 --- a/geniusrise_text/notebook/notebook.py +++ b/geniusrise_text/notebook/notebook.py @@ -130,32 +130,24 @@ def create( self.install_packages( [ - "numpy==1.21.6", - "scikit-learn==1.3.0", - "pandas==1.3.5", - "matplotlib-inline==0.1.6", - "seaborn==0.13.1", - "torch==2.1.2", - "tensorflow==2.15.0", - "transformers", - "datasets", - "evaluate", - "diffusers", - "nemo_toolkit[all]", "jupyterthemes", "jupyter==1.0.0", + "jupyterlab_legos_ui", + "jupyterlab_darkside_ui", + "theme-darcula", + "jupyter_contrib_nbextensions", ] ) # self.install_jupyter_extensions( # [ - # "jupyter_contrib_nbextensions", - # "jupyter_nbextensions_configurator", - # "jupyter_tensorboard", + # # "jupyter_contrib_nbextensions", + # # "jupyter_nbextensions_configurator", + # "@yeebc/jupyterlab_neon_theme", + # 
"@yudai-nkt/jupyterlab_city-lights-theme", # "rise", # "nbdime", # ] # ) - self.enable_jupyter_dark_theme() self.start_jupyter_server(notebook_dir=output_path, port=port, password=password) @@ -192,15 +184,17 @@ def start_jupyter_server(self, notebook_dir: str, port: int = 8888, password: Op command = [ "jupyter", - "notebook", - "--password", - password, + "lab", + # f"--ServerApp.password=''", + "--ip=0.0.0.0", + f"--ServerApp.token={password}", "--no-browser", "--port", str(port), - "--notebook-dir", + "--ServerApp.root_dir", notebook_dir, ] + self.log.info(f"Running command {' '.join(command)}") subprocess.run(command, check=True) # type: ignore @@ -226,10 +220,3 @@ def install_jupyter_extensions(self, extensions: List[str]): subprocess.run(["jupyter", "nbextension", "install", extension, "--user"], check=True) subprocess.run(["jupyter", "nbextension", "enable", extension, "--user"], check=True) self.log.info("Jupyter extensions installed and enabled.") - - def enable_jupyter_dark_theme(self): - """ - Enable dark theme for Jupyter Notebook. 
- """ - subprocess.run(["jt", "-t", "onedork"], check=True) # Example: using 'onedork' theme from jt (jupyterthemes) - self.log.info("Jupyter dark theme enabled.") From ff12828d7da9c811a06ab859966435de9450e0e7 Mon Sep 17 00:00:00 2001 From: ixaxaar Date: Fri, 19 Jan 2024 18:04:08 +0530 Subject: [PATCH 4/4] fixes --- geniusrise_text/notebook/notebook.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/geniusrise_text/notebook/notebook.py b/geniusrise_text/notebook/notebook.py index b562418..b3b47e6 100644 --- a/geniusrise_text/notebook/notebook.py +++ b/geniusrise_text/notebook/notebook.py @@ -135,12 +135,15 @@ def create( "jupyterlab_legos_ui", "jupyterlab_darkside_ui", "theme-darcula", - "jupyter_contrib_nbextensions", + # "notebook==6.4.12", + # "jupyter_contrib_nbextensions", ] ) + + # subprocess.run("jupyter contrib nbextension install --user".split(" "), check=True) + # self.install_jupyter_extensions( # [ - # # "jupyter_contrib_nbextensions", # # "jupyter_nbextensions_configurator", # "@yeebc/jupyterlab_neon_theme", # "@yudai-nkt/jupyterlab_city-lights-theme",