Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Actually use auto generate examples script #686

Merged
merged 18 commits into from
Oct 13, 2023
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
301 changes: 301 additions & 0 deletions bin/generate_truss_examples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,301 @@
"""
Script to take the Truss examples in https://github.com/basetenlabs/truss-examples-2,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not use truss-examples now that it has the new structure?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oops, yeah it is truss-examples now, just need to fix this comment

and generate documentation.



Usage:
```
$ poetry run python bin/generate_truss_examples.py
```
"""
import enum
import json
import os
import shutil
import subprocess
import sys
from pathlib import Path
from typing import List, Optional, Tuple

import yaml

DOC_CONFIGURATION_FILE = "doc.yaml"
TRUSS_EXAMPLES_REPO = "https://github.com/basetenlabs/truss-examples"
DESTINATION_DIR = "truss-examples"
MINT_CONFIG_PATH = "docs/mint.json"


class FileType(enum.Enum):
YAML = "yaml"
PYTHON = "python"


def clone_repo():
"""
If the destination directory exists, remove it.
Then, clone the given repo into the specified directory.
"""
if os.path.exists(DESTINATION_DIR):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: prefer to use pathlib.Path operations over os.path to stay consistent with the style of the rest of the repo

shutil.rmtree(DESTINATION_DIR)

try:
subprocess.run(
["git", "clone", TRUSS_EXAMPLES_REPO, DESTINATION_DIR], check=True
)
print(f"Successfully cloned {TRUSS_EXAMPLES_REPO} to {DESTINATION_DIR}")
except subprocess.CalledProcessError as e:
print(f"Error cloning the repo: {e}")
sys.exit(1)


def fetch_file_contents(path: str):
with open(path, "r") as f:
return f.read()


def _fetch_example_dirs(root_dir: str) -> List[str]:
"""
Walk through the directory structure from the root directory and
find all directories that have the specified file in it.
"""
dirs_with_file = []

for dirpath, _, filenames in os.walk(root_dir):
if DOC_CONFIGURATION_FILE in filenames:
dirs_with_file.append(dirpath)

return dirs_with_file


def _get_example_destination(truss_directory: str) -> Path:
"""
Get the destination directory for the example.
"""
original_path = Path(truss_directory)
folder, example = original_path.parts[1:]
example_file = f"{example}.mdx"
return Path("docs/examples") / folder / example_file


def _get_file_type(file_path: str) -> FileType:
_, extension = os.path.splitext(file_path)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if extension == ".yaml":
return FileType.YAML

if extension == ".py":
return FileType.PYTHON

raise ValueError(f"Unknown file type: {extension}")


class ContentBlock:
def formatted_content(self) -> str:
raise NotImplementedError


class CodeBlock(ContentBlock):
def __init__(self, file_type: FileType, file_path: str):
self.file_type = file_type
self.file_path = file_path
self.content = ""

def formatted_content(self) -> str:
"""
Outputs code blocks in the format:

```python main.py
def main():
...
```
"""
return f"\n```{self.file_type.value} {self.file_path}\n{self.content}```"


class MarkdownBlock(ContentBlock):
def __init__(self, content: str):
self.content = content

def formatted_content(self) -> str:
# Remove the first comment and space character, such that
# "# Hello" becomes "Hello
return self.content.strip()[2:]


class MarkdownExtractor:
"""
Class that supports ingesting a code file line-by-line, and produces a formatted
mdx file.
"""

def __init__(self, file_type: FileType, file_path: str):
self.file_type = file_type
self.file_path = file_path

self.blocks: List[ContentBlock] = []
self.current_code_block: Optional[CodeBlock] = None

def ingest(self, line: str):
"""
For each line, check that it is a comment by the presence of "#".
If it is a comment, append it to the blocks.

If it is not a comment, either append to the current code block, or
create a new code block if this isn't one.

When this is finished, we can then very easily produce the mdx file.
"""
stripped_line = line.strip()

# Case of Markdown line
if stripped_line.startswith("#"):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to handle multiple comments/docstrings?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about this -- parsing out docstrings is a little tough. Since for the examples that we are including in the docs we are in charge of, I thin it's easier if we just choose to use the "#" style comments to write the "doc" part of this

self.current_code_block = None
self.blocks.append(MarkdownBlock(line))
else:
if self.current_code_block is None:
self.current_code_block = CodeBlock(self.file_type, self.file_path)
self.blocks.append(self.current_code_block)
self.current_code_block.content += line + "\n"

def _formatted_request_example(self) -> str:
"""
A key part of the mdx file is that each has a <RequestExample> block at the
bottom the file. This generates that for the given file by appending all the
CodeBlocks together.
"""
code_blocks = [block for block in self.blocks if isinstance(block, CodeBlock)]
code_content = "".join([code_block.content for code_block in code_blocks])

return f"""```{self.file_type.value} {self.file_path}\n{code_content}```"""

def mdx_content(self) -> Tuple[str, str]:
full_content = "\n".join([block.formatted_content() for block in self.blocks])

return (
full_content + "\n",
self._formatted_request_example(),
)


def _extract_mdx_content_and_code(full_file_path: str, path: str) -> Tuple[str, str]:
file_content = fetch_file_contents(full_file_path)
file_type = _get_file_type(path)
extractor = MarkdownExtractor(file_type, path)
for line in file_content.splitlines():
extractor.ingest(line)

return extractor.mdx_content()


def _generate_request_example_block(code: str):
return f"""
<RequestExample>
{code}
</RequestExample>
"""


def _generate_truss_example(truss_directory: str):
print("Generating example for: ", truss_directory)
doc_information = yaml.safe_load(
fetch_file_contents(f"{truss_directory}/{DOC_CONFIGURATION_FILE}")
)

example_destination = _get_example_destination(truss_directory)

header = f"""---
title: "{doc_information["title"]}"
description: "{doc_information["description"]}"
---
"""

path_in_examples_repo = "/".join(Path(truss_directory).parts[1:])
link_to_github = f"""
<Card
title="View on Github"
icon="github" href="{TRUSS_EXAMPLES_REPO}/tree/main/{path_in_examples_repo}">
</Card>
"""
files_to_scrape = doc_information["files"]

full_content, code_blocks = zip(
*[
_extract_mdx_content_and_code(Path(truss_directory) / file, file)
for file in files_to_scrape
]
)

full_code_block = "\n".join(code_blocks)
file_content = "\n".join(full_content) + _generate_request_example_block(
full_code_block
)
example_content = f"""{header}\n{link_to_github}\n{file_content}"""
path_to_example = Path(example_destination)
path_to_example.parent.mkdir(parents=True, exist_ok=True)

path_to_example.write_text(example_content)


def _format_group_name(group_name: str) -> str:
"""
This function takes the parent directory name in, and converts it
into a more human readable format for the table of contents.

Note that parent directory names are assumed to be in the format:
* 1_introduction/... (becomes "Introduction")
* 2_image_classification/... (becomes "Image classification")
* 3_llms/... (becomes "LLMs")
"""
lowercase_name = " ".join(group_name.split("_")[1:])
# Capitalize the first letter. We do this rather than
# use .capitalize() or .title() because we want to preserve
# the case of subsequent letters
return lowercase_name[0].upper() + lowercase_name[1:]


def update_toc(example_dirs: List[str]):
"""
Update the table of contents in the README.md file.

Parameters:
example_dirs: List of directories as strings in the form "truss-examples-2/..."
"""

# Exclude the root directory ("truss_examples") from the path
transformed_example_paths = [Path(example).parts[1:] for example in example_dirs]

mint_config = json.loads(fetch_file_contents(MINT_CONFIG_PATH))
navigation = mint_config["navigation"]

examples_section = [item for item in navigation if item["group"] == "Examples"][0]

# Sort examples by the group name
examples_section["pages"] = [
f"examples/{example_path[0]}/{example_path[1]}"
for example_path in sorted(
transformed_example_paths, key=lambda example: example[0]
)
]

serialized_mint_config = json.dumps(mint_config, indent=2)
Path(MINT_CONFIG_PATH).write_text(serialized_mint_config)


def generate_truss_examples():
"""
Walk through the Truss examples repo, and for each
of the examples in the repo, generate documentation.

Finish the process by updating the table of contents.
"""
clone_repo()

example_dirs = _fetch_example_dirs(DESTINATION_DIR)
for truss_directory in example_dirs:
_generate_truss_example(truss_directory)

update_toc(example_dirs)


if __name__ == "__main__":
generate_truss_examples()
Loading
Loading