Fixed linting errors.
souradipp76 committed Nov 8, 2024
1 parent 98a2dba commit 98b0973
Showing 16 changed files with 647 additions and 484 deletions.
2 changes: 1 addition & 1 deletion doc_generator/base.py
@@ -14,4 +14,4 @@
 """

 # example constant variable
-NAME = "doc_generator"
+NAME = "doc_generator"

(The deleted and added lines are textually identical; the change is consistent with adding a missing newline at the end of the file, a common lint fix.)
75 changes: 43 additions & 32 deletions doc_generator/index/convert_json_to_markdown.py
@@ -1,11 +1,17 @@
 """
 Convert Json to Markdown
 """
+
 import json
 from pathlib import Path

-from doc_generator.types import AutodocRepoConfig, FileSummary, \
-    FolderSummary, ProcessFileParams, TraverseFileSystemParams
+from doc_generator.types import (
+    AutodocRepoConfig,
+    FileSummary,
+    FolderSummary,
+    ProcessFileParams,
+    TraverseFileSystemParams,
+)
 from doc_generator.utils.traverse_file_system import traverse_file_system
 from doc_generator.utils.file_utils import get_file_name

@@ -29,60 +35,65 @@ def count_files(process_file_params: ProcessFileParams):
         files += 1
         return

-    traverse_file_system(TraverseFileSystemParams(
-        str(input_root),
-        project_name,
-        count_files,
-        None,
-        [],
-        file_prompt,
-        folder_prompt,
-        content_type,
-        target_audience,
-        link_hosted
-    ))
+    traverse_file_system(
+        TraverseFileSystemParams(
+            str(input_root),
+            project_name,
+            count_files,
+            None,
+            [],
+            file_prompt,
+            folder_prompt,
+            content_type,
+            target_audience,
+            link_hosted,
+        )
+    )

     # Process and create markdown files for each code file in the project
     def process_file(process_file_params: ProcessFileParams) -> None:
         file_path = Path(process_file_params.file_path)
         file_name = process_file_params.file_name
-        content = file_path.read_text(encoding='utf-8')
+        content = file_path.read_text(encoding="utf-8")

         if not content or len(content) == 0:
             return

         markdown_file_path = output_root.joinpath(
-            file_path.relative_to(input_root))
+            file_path.relative_to(input_root)
+        )

         # Create the output directory if it doesn't exist
         markdown_file_path.parent.mkdir(parents=True, exist_ok=True)

         # Parse JSON content based on the file name
         data = json.loads(content)
-        if file_name == 'summary.json':
+        if file_name == "summary.json":
             data = FolderSummary(**data)
         else:
             data = FileSummary(**data)

         # Only include the file if it has a summary
-        markdown = ''
+        markdown = ""
         if data.summary:
             markdown = f"[View code on GitHub]({data.url})\n\n{data.summary}\n"
             if data.questions:
                 markdown += f"## Questions: \n{data.questions}"

-        output_path = get_file_name(markdown_file_path, '.', '.md')
-        output_path.write_text(markdown, encoding='utf-8')
+        output_path = get_file_name(markdown_file_path, ".", ".md")
+        output_path.write_text(markdown, encoding="utf-8")

-    traverse_file_system(TraverseFileSystemParams(
-        str(input_root),
-        project_name,
-        process_file,
-        None,
-        [],
-        file_prompt,
-        folder_prompt,
-        content_type,
-        target_audience,
-        link_hosted
-    ))
+    traverse_file_system(
+        TraverseFileSystemParams(
+            str(input_root),
+            project_name,
+            process_file,
+            None,
+            [],
+            file_prompt,
+            folder_prompt,
+            content_type,
+            target_audience,
+            link_hosted,
+        )
+    )
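An aside on the `FolderSummary(**data)` / `FileSummary(**data)` pattern above: it unpacks the parsed JSON object straight into the corresponding dataclass, so the JSON keys must match the field names exactly. A minimal sketch of the idiom — the two-field `FileSummary` below is a simplified stand-in, not the real class from `doc_generator.types`:

import json
from dataclasses import dataclass


@dataclass
class FileSummary:
    """Simplified stand-in for doc_generator.types.FileSummary."""
    summary: str
    url: str


raw = '{"summary": "Parses repo files.", "url": "https://example.com/f.py"}'
data = json.loads(raw)              # dict whose keys match the field names
file_summary = FileSummary(**data)  # unpack the dict into dataclass fields
print(file_summary.summary)

If the JSON carries a key the dataclass does not declare, this construction raises a TypeError, which is why the code branches on the file name before choosing which class to build.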
42 changes: 26 additions & 16 deletions doc_generator/index/create_vector_store.py
@@ -1,6 +1,7 @@
 """
 Create Vector Store
 """
+
 import fnmatch
 import os
 from pathlib import Path

@@ -14,31 +15,34 @@


 def should_ignore(file_name: str, ignore: List[str]):
-    return any(fnmatch.fnmatch(file_name, pattern)
-               for pattern in ignore)
+    return any(fnmatch.fnmatch(file_name, pattern) for pattern in ignore)


 def process_file(file_path: str, ignore: List[str]):
     """
     Process File
     """
+
     def read_file(path):
-        with open(path, 'r', encoding='utf8') as file:
+        with open(path, "r", encoding="utf8") as file:
             return file.read()

     if should_ignore(file_path, ignore):
         return None

     try:
         file_contents = read_file(file_path)
-        metadata = {'source': file_path}
+        metadata = {"source": file_path}
         doc = Document(page_content=file_contents, metadata=metadata)
         return doc
     except Exception as e:
         print(f"Error reading file {file_path}: {str(e)}")
         return None


-def process_directory(directory_path: str, ignore: List[str]) -> List[Document]:
+def process_directory(
+    directory_path: str, ignore: List[str]
+) -> List[Document]:
     """
     Process Directory
     """
@@ -47,8 +51,10 @@ def process_directory(directory_path: str, ignore: List[str]) -> List[Document]:
         files = os.listdir(directory_path)
     except Exception as e:
         print(e)
-        raise FileNotFoundError(f"Could not read directory: {directory_path}. \
-Did you run `sh download.sh`?") from e
+        raise FileNotFoundError(
+            f"Could not read directory: {directory_path}. \
+Did you run `sh download.sh`?"
+        ) from e

     for file in files:
         if should_ignore(file, ignore):
@@ -68,6 +74,7 @@ class RepoLoader(BaseLoader):
     """
     RepoLoader
     """
+
     def __init__(self, file_path: str, ignore: List[str]):
         super().__init__()
         self.file_path = file_path
@@ -77,7 +84,13 @@ def load(self) -> List[Document]:
         return process_directory(self.file_path, self.ignore)


-def create_vector_store(root: str, output: str, ignore: List[str], llms: List[LLMModels], device: str) -> None:
+def create_vector_store(
+    root: str,
+    output: str,
+    ignore: List[str],
+    llms: List[LLMModels],
+    device: str,
+) -> None:
     """
     Create Vector Store
     """
@@ -88,19 +101,16 @@ def create_vector_store(root: str, output: str, ignore: List[str], llms: List[LLMModels], device: str) -> None:
     # Split the text into chunks
     print(f"Splitting text into chunks for {len(raw_docs)} docs")
     text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=1000,
-        chunk_overlap=100
+        chunk_size=1000, chunk_overlap=100
     )
     docs = text_splitter.split_documents(raw_docs)
     # Create the vectorstore
-    print('Creating vector store....')
+    print("Creating vector store....")
     vector_store = HNSWLib.from_documents(
-        docs,
-        get_embeddings(llm.name, device),
-        docstore=InMemoryDocstore()
+        docs, get_embeddings(llm.name, device), docstore=InMemoryDocstore()
     )

-    print('Saving vector store output....')
+    print("Saving vector store output....")
     vector_store.save(output)

-    print('Done creating vector store....')
+    print("Done creating vector store....")
101 changes: 56 additions & 45 deletions doc_generator/index/index.py
@@ -1,6 +1,7 @@
 """
 Index
 """
+
 from pathlib import Path
 from doc_generator.types import AutodocRepoConfig

@@ -11,59 +12,69 @@

 def index(config: AutodocRepoConfig):
     """Index"""
-    json_path = Path(config.output) / 'docs' / 'json'
-    markdown_path = Path(config.output) / 'docs' / 'markdown'
-    data_path = Path(config.output) / 'docs' / 'data'
+    json_path = Path(config.output) / "docs" / "json"
+    markdown_path = Path(config.output) / "docs" / "markdown"
+    data_path = Path(config.output) / "docs" / "data"

     # Ensure directories exist
     json_path.mkdir(parents=True, exist_ok=True)
     markdown_path.mkdir(parents=True, exist_ok=True)
     data_path.mkdir(parents=True, exist_ok=True)

     # Process the repository to create JSON files
-    print('Processing repository...')
-    process_repository(AutodocRepoConfig(
-        name=config.name,
-        repository_url=config.repository_url,
-        root=config.root,
-        output=str(json_path),
-        llms=config.llms,
-        priority=config.priority,
-        max_concurrent_calls=config.max_concurrent_calls,
-        add_questions=config.add_questions,
-        ignore=config.ignore,
-        file_prompt=config.file_prompt,
-        folder_prompt=config.folder_prompt,
-        chat_prompt=config.chat_prompt,
-        content_type=config.content_type,
-        target_audience=config.target_audience,
-        link_hosted=config.link_hosted,
-        peft_model_path=config.peft_model_path,
-        device=config.device
-    ))
+    print("Processing repository...")
+    process_repository(
+        AutodocRepoConfig(
+            name=config.name,
+            repository_url=config.repository_url,
+            root=config.root,
+            output=str(json_path),
+            llms=config.llms,
+            priority=config.priority,
+            max_concurrent_calls=config.max_concurrent_calls,
+            add_questions=config.add_questions,
+            ignore=config.ignore,
+            file_prompt=config.file_prompt,
+            folder_prompt=config.folder_prompt,
+            chat_prompt=config.chat_prompt,
+            content_type=config.content_type,
+            target_audience=config.target_audience,
+            link_hosted=config.link_hosted,
+            peft_model_path=config.peft_model_path,
+            device=config.device,
+        )
+    )

     # Convert the JSON files to Markdown
-    print('Creating markdown files...')
-    convert_json_to_markdown(AutodocRepoConfig(
-        name=config.name,
-        repository_url=config.repository_url,
-        root=str(json_path),
-        output=str(markdown_path),
-        llms=config.llms,
-        priority=config.priority,
-        max_concurrent_calls=config.max_concurrent_calls,
-        add_questions=config.add_questions,
-        ignore=config.ignore,
-        file_prompt=config.file_prompt,
-        folder_prompt=config.folder_prompt,
-        chat_prompt=config.chat_prompt,
-        content_type=config.content_type,
-        target_audience=config.target_audience,
-        link_hosted=config.link_hosted,
-        peft_model_path=config.peft_model_path,
-        device=config.device
-    ))
+    print("Creating markdown files...")
+    convert_json_to_markdown(
+        AutodocRepoConfig(
+            name=config.name,
+            repository_url=config.repository_url,
+            root=str(json_path),
+            output=str(markdown_path),
+            llms=config.llms,
+            priority=config.priority,
+            max_concurrent_calls=config.max_concurrent_calls,
+            add_questions=config.add_questions,
+            ignore=config.ignore,
+            file_prompt=config.file_prompt,
+            folder_prompt=config.folder_prompt,
+            chat_prompt=config.chat_prompt,
+            content_type=config.content_type,
+            target_audience=config.target_audience,
+            link_hosted=config.link_hosted,
+            peft_model_path=config.peft_model_path,
+            device=config.device,
+        )
+    )

     # Create a vector store from the Markdown documents
-    print('Creating vector files...')
-    create_vector_store(str(config.root), str(data_path), config.ignore, config.llms, config.device)
+    print("Creating vector files...")
+    create_vector_store(
+        str(config.root),
+        str(data_path),
+        config.ignore,
+        config.llms,
+        config.device,
+    )
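For orientation, `index` drives the three stages above in order: `process_repository` writes JSON summaries, `convert_json_to_markdown` turns them into Markdown, and `create_vector_store` embeds the result. A hypothetical driver is sketched below, assuming `AutodocRepoConfig` and `LLMModels` are importable from `doc_generator.types` as the diffs suggest; every concrete value (URL, paths, prompts, the `GPT3` member name) is an illustrative assumption, not taken from the repository:

# Hypothetical usage sketch -- field names mirror the AutodocRepoConfig
# construction in index() above; the values are assumptions.
from doc_generator.index.index import index
from doc_generator.types import AutodocRepoConfig, LLMModels

config = AutodocRepoConfig(
    name="doc_generator",
    repository_url="https://github.com/souradipp76/doc_generator",  # assumed URL
    root="./doc_generator",
    output="./output",
    llms=[LLMModels.GPT3],  # assumed enum member
    priority=None,
    max_concurrent_calls=50,
    add_questions=False,
    ignore=["*.pyc", "__pycache__", ".git*"],
    file_prompt="Summarize this file.",
    folder_prompt="Summarize this folder.",
    chat_prompt="",
    content_type="code",
    target_audience="developers",
    link_hosted=False,
    peft_model_path=None,
    device="cpu",
)

index(config)  # JSON summaries -> Markdown docs -> vector store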