feat: add requirements file and update API endpoint with error handling
sumansaurabh committed Nov 13, 2024
1 parent 370a17e commit 8193a22
Showing 5 changed files with 275 additions and 77 deletions.
164 changes: 164 additions & 0 deletions .gitignore
@@ -0,0 +1,164 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

**/*.pyc
# C extensions
*.so
__pycache__/

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
25 changes: 15 additions & 10 deletions apiCall.py
@@ -10,7 +10,7 @@
 def generate_code_tree(file_path: str, content: str, modified_lines: List[int]) -> Dict[str, CodeTree]:
     """Generate a code tree for a file with modified lines."""
 
-    url = "http://localhost:8000/api/v1/hook/file/generate/codetree"
+    url = "https://production-gateway.snorkell.ai/api/v1/hook/file/generate/codetree"
     data = {
         "file_path": file_path,
         "content": content,
@@ -20,14 +20,19 @@ def generate_code_tree(file_path: str, content: str, modified_lines: List[int])
     headers = {
         'accept': 'application/json',
         'Content-Type': 'application/json',
-        'api-key': 'skl_ai_D7ZgnFMcAdKj7TcT'
+        'api-key': 'skl_ai_gQs0G76hUSiCK8Uk'
     }
 
-    response = requests.post(
-        url,
-        headers=headers,
-        json=data
-    )
-
-    response.raise_for_status()
-    return response.json()
+    try:
+        response = requests.post(
+            url,
+            headers=headers,
+            json=data
+        )
+
+        response.raise_for_status()
+        return response.json()
+    except Exception as e:
+        print(e)
+        print(f"Error in generating code tree for file - {file_path}")
+        return {}
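
Note on the apiCall.py change: generate_code_tree no longer raises on a failed request; any exception is printed and an empty dict is returned instead. A minimal caller-side sketch of that new contract (the file name and content below are illustrative, not taken from the repository):

    # Sketch: generate_code_tree now returns {} on any request failure
    # rather than raising, so callers must treat an empty dict as
    # "no code tree available" and skip the file.
    tree = generate_code_tree("example.py", "def add(a, b):\n    return a + b", [])
    if not tree:
        print("skipping example.py: no code tree returned")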
129 changes: 66 additions & 63 deletions dualEncoder.py
@@ -41,48 +41,14 @@ def __init__(
         # Auto-detect device if none specified
         if device is None:
             device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"Using device: {device}")
 
         self.code_encoder = SentenceTransformer(code_model, device=device)
         self.doc_encoder = SentenceTransformer(doc_model, device=device)
         self.code_weight = code_weight
         self.doc_weight = 1 - code_weight
         self.functions: List[CodeFunction] = []
 
-    def parse_python_file(self, file_path: str) -> List[CodeFunction]:
-        """Parse a Python file and extract functions with their documentation."""
-        with open(file_path, 'r') as file:
-            content = file.read()
-
-        tree = ast.parse(content)
-        functions = []
-
-        for node in ast.walk(tree):
-            if isinstance(node, ast.FunctionDef):
-                # Extract function code
-                code_lines = content.split('\n')[node.lineno-1:node.end_lineno]
-                code = '\n'.join(code_lines)
-
-                # Extract docstring and comments
-                docstring = ast.get_docstring(node) or ''
-
-                # Extract inline comments
-                comments = []
-                for child in ast.walk(node):
-                    if hasattr(child, 'lineno'):
-                        line = code_lines[child.lineno - node.lineno]
-                        if '#' in line:
-                            comments.append(line[line.index('#')+1:].strip())
-
-                all_documentation = docstring + '\n' + '\n'.join(comments)
-
-                functions.append(CodeFunction(
-                    name=node.name,
-                    code=code,
-                    documentation=all_documentation,
-                    file_path=file_path
-                ))
-
-        return functions
 
     def load_documentation(self, docs_path: str) -> Dict[str, str]:
         """Load external documentation from a directory."""
@@ -96,14 +62,6 @@ def load_documentation(self, docs_path: str) -> Dict[str, str]:
 
         return docs
 
-    def preprocess_code(self, code: str) -> str:
-        """Preprocess code for better embedding."""
-        # Remove comments
-
-        print(code)
-        tree = ast.parse(code)
-        return ast.unparse(tree)
-
     def encode_batch(
         self,
         texts: List[str],
@@ -118,26 +76,45 @@ def encode_batch(
             convert_to_numpy=True
         )
 
+
     def index_repository(self, repo_path: str, docs_path: str, force_update: bool = False):
         """Index all Python files using both encoders."""
-        python_files = glob.glob(os.path.join(repo_path, "**/*.py"), recursive=True)
 
+
         # external_docs = self.load_documentation(docs_path)
+        index_path = f"{repo_path}/function_index.json"
+        if not force_update:
+            # Check if index already exists
+            print("checking if index exists")
+
+            if os.path.exists(index_path):
+                print("index exists")
+                self.load_index(index_path)
+                return
+
+        python_files = glob.glob(os.path.join(repo_path, "**/*"), recursive=True)
+        print("tot files - ", len(python_files))
+
+        # filter ["py","ts","cs","c","js", "kt"]
+        python_files = [file for file in python_files if file.split(".")[-1] in ["py","ts","cs","c","js", "kt"]]
+
+        print("selected files - ", len(python_files))
 
         # Collect all texts to encode
        codes_to_encode = []
@@ -151,30 +128,56 @@ def index_repository(self, repo_path: str, docs_path: str, force_update: bool =
                 code_str = file.read()
 
             code_tree = generate_code_tree(file_path, code_str, [])
-            code_tree: CodeTree = CodeTree(**code_tree)
-            pprint(code_tree)
+            try:
+                code_tree: CodeTree = CodeTree(**code_tree)
+            except Exception as e:
+                print("Error in parsing code tree")
+                continue
+            # pprint(code_tree)
 
+            if code_tree.methods is not None:
+                for func, func_dict in code_tree.methods.items():
+                    # Prepare code for embedding
+                    processed_code = func_dict.content
+                    codes_to_encode.append(processed_code)
+                    func_name = func.split("~")[0]
+
+                    # Combine all documentation
+                    combined_doc = f"MethodName: {func_name} \n{func_dict.docstring}"
+
+                    docs_to_encode.append(combined_doc)
+                    temp_functions.append(CodeFunction(
+                        name=func_name,
+                        code=func_dict.content,
+                        documentation=func_dict.docstring,
+                        file_path=file_path
+                    ))
+            if code_tree.classes is not None:
+                for class_details, class_dict in code_tree.classes.items():
+                    for func, func_dict in class_dict.methods.items():
+                        # Prepare code for embedding
+                        processed_code = func_dict.content
+                        codes_to_encode.append(processed_code)
+                        func_name = func.split("~")[0]
+
+                        # Combine all documentation
+                        combined_doc = f"MethodName: {func_name} \n{func_dict.docstring}"
+
+                        docs_to_encode.append(combined_doc)
+                        temp_functions.append(CodeFunction(
+                            name=func_name,
+                            code=func_dict.content,
+                            documentation=func_dict.docstring,
+                            file_path=file_path
+                        ))
+            else:
+                print("No class found in file - ", file_path)
+
-
-            for func, func_dict in code_tree.methods.items():
-                # Prepare code for embedding
-                processed_code = func_dict.content
-                codes_to_encode.append(processed_code)
-                func_name = func.split("~")[0]
-
-                # Combine all documentation
-                combined_doc = f"MethodName: {func_name} \n{func_dict.docstring}"
-
-                docs_to_encode.append(combined_doc)
-                temp_functions.append(CodeFunction(
-                    name=func_name,
-                    code=func_dict.content,
-                    documentation=func_dict.docstring,
-                    file_path=file_path
-                ))
 
         # Batch encode everything
         code_embeddings = self.encode_batch(codes_to_encode, self.code_encoder)
         doc_embeddings = self.encode_batch(docs_to_encode, self.doc_encoder)
+        print(2)
 
         # Assign embeddings to functions
         for func, code_emb, doc_emb in zip(temp_functions, code_embeddings, doc_embeddings):
[Diff truncated: the remainder of dualEncoder.py and the two remaining changed files are not shown.]
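
As restructured in dualEncoder.py, index_repository now reuses an on-disk index unless force_update is set, then globs every file in the repository and keeps only recognized source extensions before encoding. A self-contained sketch of that selection step, mirroring the filter in the diff (the names select_source_files and ALLOWED_EXTENSIONS are illustrative, not part of the repository):

    import glob
    import os

    ALLOWED_EXTENSIONS = {"py", "ts", "cs", "c", "js", "kt"}

    def select_source_files(repo_path: str) -> list:
        # Glob everything recursively, then keep files whose final
        # dot-separated segment is a recognized source extension,
        # as the diff does before indexing.
        all_files = glob.glob(os.path.join(repo_path, "**/*"), recursive=True)
        return [f for f in all_files if f.split(".")[-1] in ALLOWED_EXTENSIONS]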
