From a34692dffcd5ae57b39a3e2aa0bd58ba6fb24fcb Mon Sep 17 00:00:00 2001 From: Souradip Pal Date: Fri, 8 Nov 2024 01:53:32 -0600 Subject: [PATCH] Fixed utils test. --- Makefile | 2 +- doc_generator/main.py | 4 +- doc_generator/query/create_chat_chain.py | 13 +- doc_generator/utils/llm_utils.py | 4 +- tests/utils/test_file_utils.py | 75 +++++++++-- tests/utils/test_llm_utils.py | 162 ++++++++++++++++------- 6 files changed, 192 insertions(+), 68 deletions(-) diff --git a/Makefile b/Makefile index 1a1b97c..de71448 100644 --- a/Makefile +++ b/Makefile @@ -35,7 +35,7 @@ lint: ## Run pep8, black, mypy linters. $(ENV_PREFIX)flake8 doc_generator/ $(ENV_PREFIX)black -l 79 --check doc_generator/ $(ENV_PREFIX)black -l 79 --check tests/ - $(ENV_PREFIX)mypy --ignore-missing-imports doc_generator/ + $(ENV_PREFIX)mypy --ignore-missing-imports --disable-error-code=arg-type doc_generator/ .PHONY: test test: lint ## Run tests and generate coverage report. diff --git a/doc_generator/main.py b/doc_generator/main.py index 78919ee..6b593a7 100644 --- a/doc_generator/main.py +++ b/doc_generator/main.py @@ -36,8 +36,8 @@ def url_validator(x): default=f"./{name}/", ).ask() project_url = questionary.text( - message="Project URL?[Example: \ - https://github.com/username/doc_generator]", + message="Project URL?[Example: " + + "https://github.com/username/doc_generator]", validate=url_validator, ).ask() output_dir = questionary.path( diff --git a/doc_generator/query/create_chat_chain.py b/doc_generator/query/create_chat_chain.py index dfe22fa..8c1f77b 100644 --- a/doc_generator/query/create_chat_chain.py +++ b/doc_generator/query/create_chat_chain.py @@ -4,8 +4,11 @@ from typing import List from langchain.chains.conversational_retrieval.base import ChatVectorDBChain -from langchain.chains import LLMChain, StuffDocumentsChain +from langchain.chains import LLMChain from langchain.chains import create_retrieval_chain +from langchain.chains.combine_documents.stuff import ( + create_stuff_documents_chain, +) from langchain.prompts import PromptTemplate from doc_generator.types import LLMModels from doc_generator.utils.llm_utils import ( @@ -204,8 +207,8 @@ def make_qa_chain( chat_prompt, target_audience, ) - doc_chain = StuffDocumentsChain( - llm_chain=doc_chat_model, document_prompt=qa_prompt + doc_chain = create_stuff_documents_chain( + llm=doc_chat_model, prompt=qa_prompt ) return ChatVectorDBChain( @@ -268,8 +271,8 @@ def make_readme_chain( chat_prompt, target_audience, ) - doc_chain = StuffDocumentsChain( - llm_chain=doc_chat_model, document_prompt=readme_prompt + doc_chain = create_stuff_documents_chain( + llm=doc_chat_model, prompt=readme_prompt ) return create_retrieval_chain( diff --git a/doc_generator/utils/llm_utils.py b/doc_generator/utils/llm_utils.py index 3b031bb..ce06595 100644 --- a/doc_generator/utils/llm_utils.py +++ b/doc_generator/utils/llm_utils.py @@ -339,9 +339,9 @@ def print_model_details(models): "File Count": model_details.total, "Succeeded": model_details.succeeded, "Failed": model_details.failed, - "Tokens": model_details.inputTokens + model_details.output_tokens, + "Tokens": model_details.input_tokens + model_details.output_tokens, "Cost": ( - (model_details.inputTokens / 1000) + (model_details.input_tokens / 1000) * model_details.input_cost_per_1k_tokens + (model_details.output_tokens / 1000) * model_details.output_cost_per_1k_tokens diff --git a/tests/utils/test_file_utils.py b/tests/utils/test_file_utils.py index 5933294..d38b360 100644 --- a/tests/utils/test_file_utils.py +++ b/tests/utils/test_file_utils.py @@ -1,52 +1,74 @@ import pytest -from doc_generator.utils.file_utils import get_file_name, github_file_url, github_folder_url +from doc_generator.utils.file_utils import ( + get_file_name, + github_file_url, + github_folder_url, +) + def test_get_file_name_with_delimiter(): assert get_file_name("example.txt") == "example.md" + def test_get_file_name_without_delimiter(): assert get_file_name("example") == "example.md" + def test_get_file_name_custom_delimiter(): assert get_file_name("example-text", delimiter="-") == "example.md" + def test_get_file_name_no_delimiter_custom_extension(): assert get_file_name("example", extension=".txt") == "example.txt" + def test_get_file_name_with_delimiter_custom_extension(): assert get_file_name("example.txt", extension=".txt") == "example.txt" + def test_get_file_name_with_multiple_delimiters(): assert get_file_name("my.example.txt") == "my.example.md" + def test_get_file_name_with_no_delimiter_and_no_extension(): assert get_file_name("example", extension="") == "example" + def test_get_file_name_with_delimiter_and_no_extension(): assert get_file_name("example.txt", extension="") == "example" + def test_get_file_name_empty_input(): assert get_file_name("") == ".md" + def test_get_file_name_delimiter_not_in_input(): assert get_file_name("example", delimiter="/") == "example.md" + def test_get_file_name_delimiter_at_end(): assert get_file_name("example.", delimiter=".") == "example.md" + def test_get_file_name_delimiter_at_start(): assert get_file_name(".example", delimiter=".") == ".md" + def test_get_file_name_delimiter_multiple_occurrences(): assert get_file_name("my.file.name.txt") == "my.file.name.md" + def test_github_file_url_link_hosted_true(): github_root = "https://github.com/user/repo" input_root = "/home/user/project" file_path = "/home/user/project/docs/file.md" link_hosted = True expected_url = f"{github_root}/{file_path[len(input_root)-1:]}" - assert github_file_url(github_root, input_root, file_path, link_hosted) == expected_url + assert ( + github_file_url(github_root, input_root, file_path, link_hosted) + == expected_url + ) + def test_github_file_url_link_hosted_false(): github_root = "https://github.com/user/repo" @@ -54,7 +76,11 @@ def test_github_file_url_link_hosted_false(): file_path = "/home/user/project/docs/file.md" link_hosted = False expected_url = f"{github_root}/blob/master/{file_path[len(input_root)-1:]}" - assert github_file_url(github_root, input_root, file_path, link_hosted) == expected_url + assert ( + github_file_url(github_root, input_root, file_path, link_hosted) + == expected_url + ) + def test_github_file_url_empty_input_root(): github_root = "https://github.com/user/repo" @@ -62,7 +88,11 @@ def test_github_file_url_empty_input_root(): file_path = "/docs/file.md" link_hosted = False expected_url = f"{github_root}/blob/master/{file_path[-1:]}" - assert github_file_url(github_root, input_root, file_path, link_hosted) == expected_url + assert ( + github_file_url(github_root, input_root, file_path, link_hosted) + == expected_url + ) + def test_github_file_url_empty_file_path(): github_root = "https://github.com/user/repo" @@ -70,7 +100,11 @@ def test_github_file_url_empty_file_path(): file_path = "" link_hosted = False expected_url = f"{github_root}/blob/master/{file_path[len(input_root)-1:]}" - assert github_file_url(github_root, input_root, file_path, link_hosted) == expected_url + assert ( + github_file_url(github_root, input_root, file_path, link_hosted) + == expected_url + ) + def test_github_folder_url_link_hosted_true(): github_root = "https://github.com/user/repo" @@ -78,15 +112,25 @@ def test_github_folder_url_link_hosted_true(): folder_path = "/home/user/project/docs/" link_hosted = True expected_url = f"{github_root}/{folder_path[len(input_root)-1:]}" - assert github_folder_url(github_root, input_root, folder_path, link_hosted) == expected_url + assert ( + github_folder_url(github_root, input_root, folder_path, link_hosted) + == expected_url + ) + def test_github_folder_url_link_hosted_false(): github_root = "https://github.com/user/repo" input_root = "/home/user/project" folder_path = "/home/user/project/docs/" link_hosted = False - expected_url = f"{github_root}/tree/master/{folder_path[len(input_root)-1:]}" - assert github_folder_url(github_root, input_root, folder_path, link_hosted) == expected_url + expected_url = ( + f"{github_root}/tree/master/{folder_path[len(input_root)-1:]}" + ) + assert ( + github_folder_url(github_root, input_root, folder_path, link_hosted) + == expected_url + ) + def test_github_folder_url_empty_input_root(): github_root = "https://github.com/user/repo" @@ -94,12 +138,21 @@ def test_github_folder_url_empty_input_root(): folder_path = "/docs/" link_hosted = False expected_url = f"{github_root}/tree/master/{folder_path[-1:]}" - assert github_folder_url(github_root, input_root, folder_path, link_hosted) == expected_url + assert ( + github_folder_url(github_root, input_root, folder_path, link_hosted) + == expected_url + ) + def test_github_folder_url_empty_folder_path(): github_root = "https://github.com/user/repo" input_root = "/home/user/project" folder_path = "" link_hosted = False - expected_url = f"{github_root}/tree/master/{folder_path[len(input_root)-1:]}" - assert github_folder_url(github_root, input_root, folder_path, link_hosted) == expected_url + expected_url = ( + f"{github_root}/tree/master/{folder_path[len(input_root)-1:]}" + ) + assert ( + github_folder_url(github_root, input_root, folder_path, link_hosted) + == expected_url + ) diff --git a/tests/utils/test_llm_utils.py b/tests/utils/test_llm_utils.py index 8452a8c..b2cc1fe 100644 --- a/tests/utils/test_llm_utils.py +++ b/tests/utils/test_llm_utils.py @@ -24,13 +24,21 @@ def test_get_gemma_chat_model_with_peft(): "device": "cpu", "peft_model_path": "path/to/peft/model", } - with patch("doc_generator.utils.llm_utils.hf_hub_download") as mock_hf_download, \ - patch("doc_generator.utils.llm_utils.get_tokenizer") as mock_get_tokenizer, \ - patch("doc_generator.utils.llm_utils.AutoModelForCausalLM.from_pretrained") as mock_auto_model, \ - patch("doc_generator.utils.llm_utils.PeftModel.from_pretrained") as mock_peft_model, \ - patch("doc_generator.utils.llm_utils.pipeline") as mock_pipeline, \ - patch("doc_generator.utils.llm_utils.HuggingFacePipeline") as mock_hf_pipeline, \ - patch.dict(os.environ, {"HF_TOKEN": "test_token"}): + with patch( + "doc_generator.utils.llm_utils.hf_hub_download" + ) as mock_hf_download, patch( + "doc_generator.utils.llm_utils.get_tokenizer" + ) as mock_get_tokenizer, patch( + "doc_generator.utils.llm_utils.AutoModelForCausalLM.from_pretrained" + ) as mock_auto_model, patch( + "doc_generator.utils.llm_utils.PeftModel.from_pretrained" + ) as mock_peft_model, patch( + "doc_generator.utils.llm_utils.pipeline" + ) as mock_pipeline, patch( + "doc_generator.utils.llm_utils.HuggingFacePipeline" + ) as mock_hf_pipeline, patch.dict( + os.environ, {"HF_TOKEN": "test_token"} + ): mock_tokenizer = MagicMock() mock_get_tokenizer.return_value = mock_tokenizer @@ -49,15 +57,21 @@ def test_get_gemma_chat_model_with_peft(): result = get_gemma_chat_model(model_name, model_kwargs=model_kwargs) - mock_hf_download.assert_called_once_with(model_name, model_kwargs["gguf_file"]) - mock_get_tokenizer.assert_called_once_with(model_name, model_kwargs["gguf_file"]) + mock_hf_download.assert_called_once_with( + model_name, model_kwargs["gguf_file"] + ) + mock_get_tokenizer.assert_called_once_with( + model_name, model_kwargs["gguf_file"] + ) mock_auto_model.assert_called_once_with( model_name, gguf_file=model_kwargs["gguf_file"], device_map=model_kwargs["device"], token="test_token", ) - mock_peft_model.assert_called_once_with(mock_model, model_kwargs["peft_model_path"]) + mock_peft_model.assert_called_once_with( + mock_model, model_kwargs["peft_model_path"] + ) mock_pipeline.assert_called_once() mock_hf_pipeline.assert_called_once_with( pipeline=mock_pipeline_instance, model_kwargs=model_kwargs @@ -71,13 +85,21 @@ def test_get_gemma_chat_model_without_peft(): "gguf_file": "some_file.gguf", "device": "cpu", } - with patch("doc_generator.utils.llm_utils.hf_hub_download") as mock_hf_download, \ - patch("doc_generator.utils.llm_utils.get_tokenizer") as mock_get_tokenizer, \ - patch("doc_generator.utils.llm_utils.AutoModelForCausalLM.from_pretrained") as mock_auto_model, \ - patch("doc_generator.utils.llm_utils.PeftModel.from_pretrained") as mock_peft_model, \ - patch("doc_generator.utils.llm_utils.pipeline") as mock_pipeline, \ - patch("doc_generator.utils.llm_utils.HuggingFacePipeline") as mock_hf_pipeline, \ - patch.dict(os.environ, {"HF_TOKEN": "test_token"}): + with patch( + "doc_generator.utils.llm_utils.hf_hub_download" + ) as mock_hf_download, patch( + "doc_generator.utils.llm_utils.get_tokenizer" + ) as mock_get_tokenizer, patch( + "doc_generator.utils.llm_utils.AutoModelForCausalLM.from_pretrained" + ) as mock_auto_model, patch( + "doc_generator.utils.llm_utils.PeftModel.from_pretrained" + ) as mock_peft_model, patch( + "doc_generator.utils.llm_utils.pipeline" + ) as mock_pipeline, patch( + "doc_generator.utils.llm_utils.HuggingFacePipeline" + ) as mock_hf_pipeline, patch.dict( + os.environ, {"HF_TOKEN": "test_token"} + ): mock_tokenizer = MagicMock() mock_get_tokenizer.return_value = mock_tokenizer @@ -93,8 +115,12 @@ def test_get_gemma_chat_model_without_peft(): result = get_gemma_chat_model(model_name, model_kwargs=model_kwargs) - mock_hf_download.assert_called_once_with(model_name, model_kwargs["gguf_file"]) - mock_get_tokenizer.assert_called_once_with(model_name, model_kwargs["gguf_file"]) + mock_hf_download.assert_called_once_with( + model_name, model_kwargs["gguf_file"] + ) + mock_get_tokenizer.assert_called_once_with( + model_name, model_kwargs["gguf_file"] + ) mock_auto_model.assert_called_once_with( model_name, gguf_file=model_kwargs["gguf_file"], @@ -116,13 +142,21 @@ def test_get_llama_chat_model_with_peft(): "device": "cpu", "peft_model": "path/to/peft/model", } - with patch("doc_generator.utils.llm_utils.hf_hub_download") as mock_hf_download, \ - patch("doc_generator.utils.llm_utils.get_tokenizer") as mock_get_tokenizer, \ - patch("doc_generator.utils.llm_utils.AutoModelForCausalLM.from_pretrained") as mock_auto_model, \ - patch("doc_generator.utils.llm_utils.PeftModel.from_pretrained") as mock_peft_model, \ - patch("doc_generator.utils.llm_utils.pipeline") as mock_pipeline, \ - patch("doc_generator.utils.llm_utils.HuggingFacePipeline") as mock_hf_pipeline, \ - patch.dict(os.environ, {"HF_TOKEN": "test_token"}): + with patch( + "doc_generator.utils.llm_utils.hf_hub_download" + ) as mock_hf_download, patch( + "doc_generator.utils.llm_utils.get_tokenizer" + ) as mock_get_tokenizer, patch( + "doc_generator.utils.llm_utils.AutoModelForCausalLM.from_pretrained" + ) as mock_auto_model, patch( + "doc_generator.utils.llm_utils.PeftModel.from_pretrained" + ) as mock_peft_model, patch( + "doc_generator.utils.llm_utils.pipeline" + ) as mock_pipeline, patch( + "doc_generator.utils.llm_utils.HuggingFacePipeline" + ) as mock_hf_pipeline, patch.dict( + os.environ, {"HF_TOKEN": "test_token"} + ): mock_tokenizer = MagicMock() mock_get_tokenizer.return_value = mock_tokenizer @@ -143,15 +177,21 @@ def test_get_llama_chat_model_with_peft(): result = get_llama_chat_model(model_name, model_kwargs=model_kwargs) - mock_hf_download.assert_called_once_with(model_name, model_kwargs["gguf_file"]) - mock_get_tokenizer.assert_called_once_with(model_name, model_kwargs["gguf_file"]) + mock_hf_download.assert_called_once_with( + model_name, model_kwargs["gguf_file"] + ) + mock_get_tokenizer.assert_called_once_with( + model_name, model_kwargs["gguf_file"] + ) assert mock_tokenizer.pad_token == mock_tokenizer.eos_token mock_auto_model.assert_called_once_with( model_name, gguf_file=model_kwargs["gguf_file"], device_map=model_kwargs["device"], ) - mock_peft_model.assert_called_once_with(mock_model, model_kwargs["peft_model"]) + mock_peft_model.assert_called_once_with( + mock_model, model_kwargs["peft_model"] + ) mock_pipeline.assert_called_once() mock_hf_pipeline.assert_called_once_with( pipeline=mock_pipeline_instance, model_kwargs=model_kwargs @@ -165,13 +205,21 @@ def test_get_llama_chat_model_without_peft(): "gguf_file": "some_file.gguf", "device": "cpu", } - with patch("doc_generator.utils.llm_utils.hf_hub_download") as mock_hf_download, \ - patch("doc_generator.utils.llm_utils.get_tokenizer") as mock_get_tokenizer, \ - patch("doc_generator.utils.llm_utils.AutoModelForCausalLM.from_pretrained") as mock_auto_model, \ - patch("doc_generator.utils.llm_utils.PeftModel.from_pretrained") as mock_peft_model, \ - patch("doc_generator.utils.llm_utils.pipeline") as mock_pipeline, \ - patch("doc_generator.utils.llm_utils.HuggingFacePipeline") as mock_hf_pipeline, \ - patch.dict(os.environ, {"HF_TOKEN": "test_token"}): + with patch( + "doc_generator.utils.llm_utils.hf_hub_download" + ) as mock_hf_download, patch( + "doc_generator.utils.llm_utils.get_tokenizer" + ) as mock_get_tokenizer, patch( + "doc_generator.utils.llm_utils.AutoModelForCausalLM.from_pretrained" + ) as mock_auto_model, patch( + "doc_generator.utils.llm_utils.PeftModel.from_pretrained" + ) as mock_peft_model, patch( + "doc_generator.utils.llm_utils.pipeline" + ) as mock_pipeline, patch( + "doc_generator.utils.llm_utils.HuggingFacePipeline" + ) as mock_hf_pipeline, patch.dict( + os.environ, {"HF_TOKEN": "test_token"} + ): mock_tokenizer = MagicMock() mock_get_tokenizer.return_value = mock_tokenizer @@ -189,8 +237,12 @@ def test_get_llama_chat_model_without_peft(): result = get_llama_chat_model(model_name, model_kwargs=model_kwargs) - mock_hf_download.assert_called_once_with(model_name, model_kwargs["gguf_file"]) - mock_get_tokenizer.assert_called_once_with(model_name, model_kwargs["gguf_file"]) + mock_hf_download.assert_called_once_with( + model_name, model_kwargs["gguf_file"] + ) + mock_get_tokenizer.assert_called_once_with( + model_name, model_kwargs["gguf_file"] + ) assert mock_tokenizer.pad_token == mock_tokenizer.eos_token mock_auto_model.assert_called_once_with( model_name, @@ -235,7 +287,9 @@ def test_get_openai_api_key_not_set(monkeypatch): def test_get_tokenizer_with_hf_token(monkeypatch): model_name = "some-model" gguf_file = "some_file.gguf" - with patch("doc_generator.utils.llm_utils.AutoTokenizer.from_pretrained") as mock_from_pretrained: + with patch( + "doc_generator.utils.llm_utils.AutoTokenizer.from_pretrained" + ) as mock_from_pretrained: mock_tokenizer = MagicMock() mock_from_pretrained.return_value = mock_tokenizer @@ -253,7 +307,9 @@ def test_get_tokenizer_with_hf_token(monkeypatch): def test_get_tokenizer_without_hf_token(monkeypatch): model_name = "some-model" gguf_file = "some_file.gguf" - with patch("doc_generator.utils.llm_utils.AutoTokenizer.from_pretrained") as mock_from_pretrained: + with patch( + "doc_generator.utils.llm_utils.AutoTokenizer.from_pretrained" + ) as mock_from_pretrained: monkeypatch.delenv("HF_TOKEN", raising=False) with pytest.raises(KeyError): get_tokenizer(model_name, gguf_file) @@ -295,8 +351,12 @@ def test_print_model_details(capsys): expected_outputs = [] for model_details in test_models.values(): tokens = model_details.input_tokens + model_details.output_tokens - cost = ((model_details.input_tokens / 1000) * model_details.input_cost_per_1k_tokens) + ( - (model_details.output_tokens / 1000) * model_details.output_cost_per_1k_tokens + cost = ( + (model_details.input_tokens / 1000) + * model_details.input_cost_per_1k_tokens + ) + ( + (model_details.output_tokens / 1000) + * model_details.output_cost_per_1k_tokens ) result = { "Model": model_details.name, @@ -329,7 +389,9 @@ def test_print_model_details_empty(capsys): print_model_details(test_models) captured = capsys.readouterr() output_lines = captured.out.strip().split("\n") - assert output_lines == ["{'Model': 'Total', 'File Count': 0, 'Succeeded': 0, 'Failed': 0, 'Tokens': 0, 'Cost': 0}"] + assert output_lines == [ + "{'Model': 'Total', 'File Count': 0, 'Succeeded': 0, 'Failed': 0, 'Tokens': 0, 'Cost': 0}" + ] def test_total_index_cost_estimate(): @@ -364,8 +426,10 @@ def test_total_index_cost_estimate(): total_cost = total_index_cost_estimate(None) expected_cost = sum( - (model_details.input_tokens / 1000) * model_details.input_cost_per_1k_tokens - + (model_details.output_tokens / 1000) * model_details.output_cost_per_1k_tokens + (model_details.input_tokens / 1000) + * model_details.input_cost_per_1k_tokens + + (model_details.output_tokens / 1000) + * model_details.output_cost_per_1k_tokens for model_details in test_models.values() ) assert total_cost == expected_cost @@ -375,7 +439,9 @@ def test_get_embeddings_llama_model(): model = "llama-something" device = "cpu" - with patch("doc_generator.utils.llm_utils.HuggingFaceEmbeddings") as mock_hf_embeddings: + with patch( + "doc_generator.utils.llm_utils.HuggingFaceEmbeddings" + ) as mock_hf_embeddings: embeddings = get_embeddings(model, device) mock_hf_embeddings.assert_called_once_with( model_name="sentence-transformers/all-mpnet-base-v2", @@ -389,7 +455,9 @@ def test_get_embeddings_non_llama_model(): model = "gpt-3.5-turbo" device = "cpu" - with patch("doc_generator.utils.llm_utils.OpenAIEmbeddings") as mock_openai_embeddings: + with patch( + "doc_generator.utils.llm_utils.OpenAIEmbeddings" + ) as mock_openai_embeddings: embeddings = get_embeddings(model, device) mock_openai_embeddings.assert_called_once_with() assert embeddings == mock_openai_embeddings.return_value