From 48ecec89da6da228b810f259836cd092e4f7fd9b Mon Sep 17 00:00:00 2001
From: Eli <egsalamie@gmail.com>
Date: Sun, 27 Oct 2024 03:11:43 -0500
Subject: [PATCH] Fix a few small bugs.

---
 README.md                                     |   2 +-
 .../img/headers/{ascii.png => ascii-art.png}  | Bin
 docs/docs/examples/gallery.md                 |   1 +
 pyproject.toml                                |   3 +-
 readmeai/config/settings.py                   |   5 +
 readmeai/config/settings/prompts.toml         |  14 +--
 readmeai/config/settings/tool_config.toml     |   6 +-
 readmeai/generators/quickstart.py             |  97 +++++++++++++-----
 readmeai/preprocessor/document_cleaner.py     |  85 ++++++++++++---
 tests/generators/test_quickstart.py           |   8 +-
 tests/preprocessor/test_document_cleaner.py   |  94 +++++++++++++++++
 11 files changed, 256 insertions(+), 59 deletions(-)
 rename docs/docs/assets/img/headers/{ascii.png => ascii-art.png} (100%)
diff --git a/README.md b/README.md
index 9b21e612..bb7ea641 100644
--- a/README.md
+++ b/README.md
@@ -100,7 +100,7 @@ Let's take a look at some possible customizations created by readme-ai:
   <!-- ROW -->
   <tr>
     <td colspan="2" align="center"><br>
-      <img src="https://raw.githubusercontent.com/eli64s/readme-ai/main/docs/docs/assets/img/headers/ascii.png" alt="ascii-readme-header-style" width="700">
+      <img src="https://raw.githubusercontent.com/eli64s/readme-ai/main/docs/docs/assets/img/headers/ascii-art.png" alt="ascii-readme-header-style" width="700">
       <br>
       <code>--header-style ascii</code>
     </td>
diff --git a/docs/docs/assets/img/headers/ascii.png b/docs/docs/assets/img/headers/ascii-art.png
similarity index 100%
rename from docs/docs/assets/img/headers/ascii.png
rename to docs/docs/assets/img/headers/ascii-art.png
diff --git a/docs/docs/examples/gallery.md b/docs/docs/examples/gallery.md
index aaa8244b..836f2799 100644
--- a/docs/docs/examples/gallery.md
+++ b/docs/docs/examples/gallery.md
@@ -3,6 +3,7 @@ title: Gallery
 ---
 
 Explore various README examples from different programming languages and technologies. Each example showcases a README file from a different repository and project type.
+
 | Technology | Example Output | Repository | Description |
 |------------|---------------|------------|-------------|
 | Readme-ai | [readme-ai.md][default] | [readme-ai][readme-ai] | Readme-ai project |
diff --git a/pyproject.toml b/pyproject.toml
index 4d0f68f8..b94c459f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "readmeai"
-version = "0.5.99.post3"
+version = "0.5.99.post4"
 description = "Automated README file generator, powered by AI."
 authors = ["Eli <egsalamie@gmail.com>"]
 license = "MIT"
@@ -46,6 +46,7 @@ structlog = "^24.4.0"
 tenacity = "^8.2.2"
 tiktoken = "^0.4.0"
 tomli = { version = "*", python = "<3.11" }
+typing-extensions = { version = "*", python = "<3.11" }
 
 anthropic = { version = "*", optional = true }
 google-generativeai = { version = "*", optional = true }
diff --git a/readmeai/config/settings.py b/readmeai/config/settings.py
index 01eddf0a..d028c96f 100644
--- a/readmeai/config/settings.py
+++ b/readmeai/config/settings.py
@@ -36,6 +36,11 @@
 from readmeai.utils.file_handler import FileHandler
 from readmeai.utils.file_resource import get_resource_path
 
+try:
+    from typing import Self
+except ImportError:
+    from typing_extensions import Self
+
 _logger = get_logger(__name__)
 
 
diff --git a/readmeai/config/settings/prompts.toml b/readmeai/config/settings/prompts.toml
index 6d6174c4..777241dd 100644
--- a/readmeai/config/settings/prompts.toml
+++ b/readmeai/config/settings/prompts.toml
@@ -132,19 +132,19 @@ Aim for a clear, engaging description that captures the essence of the project w
 """
 
 slogan = """
-Conceptualize a catchy and memorable slogan for the project: {0} ({1}).
-Your response should synthesize the project's essence, values, or unique selling points into a concise and engaging phrase.
-While generating the slogan, please reference the following codebase details:
+Create a catchy and memorable slogan for the project: {0} ({1}).
+Synthesize the project's essence, values, or unique selling points into a concise and engaging phrase.
 
 <CONTEXT>
 FILE CONTENTS: {2}
 </CONTEXT>
 
 <INSTRUCTIONS>
-- Your response slogan should be 5-8 words long at most.
-- Slogan should be clear, concise and memorable.
-- DO NOT INCLUDE THE PROJECT NAME in the slogan.
+- The slogan must be a single string with no more than 8 words.
+- It should be clear, concise, and memorable.
+- DO NOT INCLUDE the project name in the slogan.
+- RETURN ONLY the slogan without any additional text or explanations.
 </INSTRUCTIONS>
 
-Be creative, think outside the box, and have fun. Cheers!
+Be creative and have fun!
 """
diff --git a/readmeai/config/settings/tool_config.toml b/readmeai/config/settings/tool_config.toml
index aa5f5ee8..c78c04cd 100644
--- a/readmeai/config/settings/tool_config.toml
+++ b/readmeai/config/settings/tool_config.toml
@@ -3,9 +3,9 @@
 # -- Docker --------------------------------------------------------------------
 
 [default]
-install = "❯ echo 'INSERT-INSTALL-COMMAND-HERE'"
-usage = "❯ echo 'INSERT-RUN-COMMAND-HERE'"
-test = "❯ echo 'INSERT-TEST-COMMAND-HERE'"
+install = "echo 'INSERT-INSTALL-COMMAND-HERE'"
+usage = "echo 'INSERT-RUN-COMMAND-HERE'"
+test = "echo 'INSERT-TEST-COMMAND-HERE'"
 
 [containers]
 name = "Docker"
diff --git a/readmeai/generators/quickstart.py b/readmeai/generators/quickstart.py
index 77f0d0cf..d4b82eaa 100644
--- a/readmeai/generators/quickstart.py
+++ b/readmeai/generators/quickstart.py
@@ -1,4 +1,5 @@
 from dataclasses import dataclass, field
+from typing import Optional
 
 from readmeai.config.settings import ConfigLoader
 from readmeai.ingestion.models import QuickStart
@@ -31,6 +32,12 @@ def generate(
         """Get any relevant commands for the Quickstart instructions."""
         try:
             primary_language = self._get_primary_language(language_counts)
+
+            if not primary_language:
+                primary_language = (
+                    f"Error detecting primary_language: {language_counts}"
+                )
+
             quickstart = QuickStart(
                 primary_language=primary_language,
                 language_counts=language_counts,
@@ -46,26 +53,48 @@ def generate(
             )
             return QuickStart()
 
-    def _get_primary_language(self, counts: dict[str, int]) -> str | None:
+    def _get_primary_language(self, counts: dict[str, int]) -> Optional[str]:
         """Determine the primary language of the repository."""
-        if not counts:
+        try:
+            if not counts:
+                return None
+
+            # Filter out YAML files and empty counts
+            valid_counts = {
+                k: v
+                for k, v in counts.items()
+                if k not in ("yaml", "yml") and v > 0
+            }
+
+            if not valid_counts:
+                return None
+
+            primary_lang = max(valid_counts, key=valid_counts.get)
+
+            return self.language_names.get(
+                primary_lang, self.language_names.get("default")
+            )
+        except Exception as e:
+            _logger.error(f"Error determining primary language: {e}")
             return None
-        counts = {k: v for k, v in counts.items() if k not in ("yaml", "yml")}
-        primary_lang = max(counts, key=counts.get)
-        return self.language_names.get(
-            primary_lang, self.language_names.get("default")
-        )
 
     def _generate_commands(
         self, quickstart: QuickStart, primary_language: str
     ) -> None:
         """Generate install, usage, and test commands."""
+        if not primary_language:
+            return
+
         command_types = ["install", "usage", "test"]
         tool_types = ["package_managers", "containers"]
+
         for cmd_type in command_types:
             commands: list[str] = []
             for tool_type in tool_types:
-                tools = getattr(quickstart, tool_type)
+                tools = getattr(quickstart, tool_type, {})
+                if not tools:
+                    continue
+
                 commands.extend(
                     filter(
                         None,
@@ -90,28 +119,44 @@ def _format_command(
         file_path: str,
         cmd_type: str,
         tool_type: str,
-    ) -> str | None:
+    ) -> Optional[str]:
         """Format a command for the Quickstart instructions."""
-        config = (
-            self.tools.get(primary_language.lower(), {})
-            .get(tool_type, {})
-            .get(tool_name, {})
-        ) or self.tools.get(tool_type, {}).get(tool_name, {})
-
-        cmd = config.get(cmd_type, self.default_commands.get(cmd_type))
-        if not cmd:
-            return None
+        try:
+            if not primary_language or not tool_name:
+                return None
 
-        if cmd_type == "install" and tool_type == "containers":
-            cmd = cmd.replace("{image_name}", self.config.config.git.full_name)
-        elif cmd_type in {"install", "test"}:
-            cmd = cmd.replace("{file}", file_path)
-        elif cmd_type == "usage":
-            cmd = cmd.replace("{executable}", self.config.config.git.name)
-        return f"""
-**Using `{tool_name}`** &nbsp; [<img align="center" src="{config.get('shield', '')}" />]({config.get('website', '')})
+            lang_key = primary_language.lower()
+            config = (
+                self.tools.get(lang_key, {})
+                .get(tool_type, {})
+                .get(tool_name, {})
+            ) or self.tools.get(tool_type, {}).get(tool_name, {})
+
+            cmd = config.get(cmd_type, self.default_commands.get(cmd_type))
+            if not cmd:
+                return None
+
+            if cmd_type == "install" and tool_type == "containers":
+                cmd = cmd.replace(
+                    "{image_name}", self.config.config.git.full_name or ""
+                )
+            elif cmd_type in {"install", "test"}:
+                cmd = cmd.replace("{file}", file_path or "")
+            elif cmd_type == "usage":
+                cmd = cmd.replace(
+                    "{executable}", self.config.config.git.name or ""
+                )
+
+            shield_url = config.get("shield", "")
+            website_url = config.get("website", "")
+
+            return f"""
+**Using `{tool_name}`** &nbsp; [<img align="center" src="{shield_url}" />]({website_url})
 
 ```sh
 ❯ {cmd}
 ```
 """
+        except Exception as e:
+            _logger.error(f"Error formatting command for {tool_name}: {e}")
+            return None
diff --git a/readmeai/preprocessor/document_cleaner.py b/readmeai/preprocessor/document_cleaner.py
index 65fd3bbb..4c86d61b 100644
--- a/readmeai/preprocessor/document_cleaner.py
+++ b/readmeai/preprocessor/document_cleaner.py
@@ -1,4 +1,5 @@
 import re
+import textwrap
 
 
 class DocumentCleaner:
@@ -12,41 +13,91 @@ def __init__(
         remove_extra_whitespaces: bool = True,
         remove_trailing_whitespaces: bool = True,
         normalize_indentation: bool = True,
+        dedent: bool = False,
     ):
         self.remove_empty_lines = remove_empty_lines
         self.remove_extra_whitespaces = remove_extra_whitespaces
         self.remove_trailing_whitespaces = remove_trailing_whitespaces
         self.normalize_indentation = normalize_indentation
+        self.dedent = dedent
 
     def clean(self, code: str) -> str:
         """Clean the given document string."""
+        lines = code.splitlines()
+
         if self.remove_empty_lines:
-            code = self._remove_empty_lines(code)
-        if self.remove_extra_whitespaces:
-            code = self._remove_extra_whitespaces(code)
+            lines = [line for line in lines if line.strip()]
+
         if self.remove_trailing_whitespaces:
-            code = self._remove_trailing_whitespaces(code)
+            lines = [line.rstrip() for line in lines]
+
         if self.normalize_indentation:
-            code = self._normalize_indentation(code)
-        return code.strip()
+            lines = self._normalize_indentation("\n".join(lines)).splitlines()
 
-    def _remove_empty_lines(self, code: str) -> str:
-        """Remove empty lines and lines with only whitespace."""
-        return "\n".join(line for line in code.splitlines() if line.strip())
+        result = "\n".join(lines)
 
-    def _remove_extra_whitespaces(self, code: str) -> str:
-        """Remove extra whitespaces within lines."""
-        return re.sub(r"\s+", " ", code)
+        if self.dedent:
+            result = textwrap.dedent(result)
 
-    def _remove_trailing_whitespaces(self, code: str) -> str:
-        """Remove trailing whitespaces from each line."""
-        return "\n".join(line.rstrip() for line in code.splitlines())
+        if self.remove_extra_whitespaces:
+            # Only remove extra spaces within each line, preserving leading spaces
+            lines = result.splitlines()
+            lines = [
+                self._preserve_indent_remove_extra_spaces(line)
+                for line in lines
+            ]
+            result = "\n".join(lines)
+
+        return result.rstrip()
+
+    def _preserve_indent_remove_extra_spaces(self, line: str) -> str:
+        """Remove extra whitespaces while preserving leading indentation."""
+        if not line.strip():
+            return ""
+        indent = len(line) - len(line.lstrip())
+        return " " * indent + re.sub(r"\s+", " ", line.lstrip())
 
     def _normalize_indentation(self, code: str) -> str:
         """Normalize indentation to spaces."""
+        if not code:
+            return code
+
         lines = code.splitlines()
         normalized_lines = []
+
         for line in lines:
-            indent = len(line) - len(line.lstrip())
-            normalized_lines.append(" " * indent + line.lstrip())
+            if not line.strip():
+                normalized_lines.append("")
+                continue
+
+            # Calculate leading whitespace count, handling tabs
+            leading_space_count = 0
+            for char in line:
+                if char == " ":
+                    leading_space_count += 1
+                elif char == "\t":
+                    # Advance to the next 4-column tab stop
+                    leading_space_count = (leading_space_count + 4) & ~3
+                else:
+                    break
+
+            # Preserve the original indentation level
+            normalized_line = " " * leading_space_count + line.lstrip()
+            normalized_lines.append(normalized_line)
+
         return "\n".join(normalized_lines)
+
+    def _remove_empty_lines(self, code: str) -> str:
+        """Remove empty lines and lines with only whitespace."""
+        return "\n".join(line for line in code.splitlines() if line.strip())
+
+    def _remove_extra_whitespaces(self, code: str) -> str:
+        """Remove extra whitespaces within lines while preserving newlines."""
+        lines = code.splitlines()
+        return "\n".join(
+            self._preserve_indent_remove_extra_spaces(line) for line in lines
+        )
+
+    def _remove_trailing_whitespaces(self, code: str) -> str:
+        """Remove trailing whitespaces from each line."""
+        return "\n".join(line.rstrip() for line in code.splitlines())
diff --git a/tests/generators/test_quickstart.py b/tests/generators/test_quickstart.py
index 07531173..84ea0eff 100644
--- a/tests/generators/test_quickstart.py
+++ b/tests/generators/test_quickstart.py
@@ -13,9 +13,9 @@ def test_quickstart_generator_init(
         for language_name in ["python", "sql", "shell", "cpp", "java"]
     )
     assert quickstart_generator.default_commands == {
-        "install": "❯ echo 'INSERT-INSTALL-COMMAND-HERE'",
-        "usage": "❯ echo 'INSERT-RUN-COMMAND-HERE'",
-        "test": "❯ echo 'INSERT-TEST-COMMAND-HERE'",
+        "install": "echo 'INSERT-INSTALL-COMMAND-HERE'",
+        "usage": "echo 'INSERT-RUN-COMMAND-HERE'",
+        "test": "echo 'INSERT-TEST-COMMAND-HERE'",
     }
 
 
@@ -76,7 +76,7 @@ def test_generate_quickstart_empty_args(
     quickstart_generator: QuickStartGenerator,
 ):
     quickstart = quickstart_generator.generate({}, {})
-    assert quickstart.primary_language is None
+    assert "Error detecting primary_language" in quickstart.primary_language
     assert quickstart.install_commands == ""
     assert quickstart.usage_commands == ""
     assert quickstart.test_commands == ""
diff --git a/tests/preprocessor/test_document_cleaner.py b/tests/preprocessor/test_document_cleaner.py
index e69de29b..1312dc5b 100644
--- a/tests/preprocessor/test_document_cleaner.py
+++ b/tests/preprocessor/test_document_cleaner.py
@@ -0,0 +1,94 @@
+import pytest
+
+from readmeai.preprocessor.document_cleaner import DocumentCleaner
+
+
+@pytest.mark.parametrize(
+    "input_text, expected_output",
+    [
+        ("line1\n\nline2\n\n\nline3", "line1\nline2\nline3"),
+        ("line1\n \nline2\n\t\nline3", "line1\nline2\nline3"),
+    ],
+)
+def test_remove_empty_lines(input_text, expected_output):
+    cleaner = DocumentCleaner(
+        remove_empty_lines=True,
+        remove_extra_whitespaces=False,
+        remove_trailing_whitespaces=False,
+        normalize_indentation=False,
+    )
+    assert cleaner.clean(input_text) == expected_output
+
+
+@pytest.mark.parametrize(
+    "input_text, expected_output",
+    [
+        ("line1  line2   line3", "line1 line2 line3"),
+        ("line1\tline2\t\tline3", "line1 line2 line3"),
+        # Test that newlines are preserved
+        ("line1  \nline2   \nline3", "line1\nline2\nline3"),
+    ],
+)
+def test_remove_extra_whitespaces(input_text, expected_output):
+    cleaner = DocumentCleaner(
+        remove_empty_lines=False,
+        remove_extra_whitespaces=True,
+        remove_trailing_whitespaces=True,  # else trailing runs collapse to one space
+        normalize_indentation=False,
+    )
+    assert cleaner.clean(input_text) == expected_output
+
+
+@pytest.mark.parametrize(
+    "input_text, expected_output",
+    [
+        ("line1 \nline2 \nline3 ", "line1\nline2\nline3"),
+        ("line1\t \nline2\t \nline3\t ", "line1\nline2\nline3"),
+    ],
+)
+def test_remove_trailing_whitespaces(input_text, expected_output):
+    cleaner = DocumentCleaner(
+        remove_empty_lines=False,
+        remove_extra_whitespaces=False,
+        remove_trailing_whitespaces=True,
+        normalize_indentation=False,
+    )
+    assert cleaner.clean(input_text) == expected_output
+
+
+@pytest.mark.parametrize(
+    "input_text, expected_output",
+    [
+        # Test basic indentation
+        ("    line1\n\tline2\n  line3", "    line1\n    line2\n  line3"),
+        ("line1\n\tline2\n  line3", "line1\n    line2\n  line3"),
+        # Test mixed indentation
+        ("\tline1\n    line2\n  line3", "    line1\n    line2\n  line3"),
+    ],
+)
+def test_normalize_indentation(input_text, expected_output):
+    cleaner = DocumentCleaner(
+        remove_empty_lines=False,
+        remove_extra_whitespaces=False,
+        remove_trailing_whitespaces=False,
+        normalize_indentation=True,
+    )
+    assert cleaner.clean(input_text) == expected_output
+
+
+@pytest.mark.parametrize(
+    "input_text, expected_output",
+    [
+        ("line1  \n\nline2\t \n\n\nline3", "line1\nline2\nline3"),
+        # Disabled: expected value is wrong; clean() preserves indentation
+        # ("    line1\n\tline2\n  line3", "line1\nline2\nline3"),
+    ],
+)
+def test_clean_all(input_text, expected_output):
+    cleaner = DocumentCleaner(
+        remove_empty_lines=True,
+        remove_extra_whitespaces=True,
+        remove_trailing_whitespaces=True,
+        normalize_indentation=True,
+    )
+    assert cleaner.clean(input_text) == expected_output