Skip to content

Commit

Permalink
Update embedding_create.py
Browse files (browse the repository at this point in the history)
  • Loading branch information
lineUCB authored and FranardoHuang committed Jul 25, 2024
1 parent 0c26071 commit 04033e2
Showing 1 changed file with 34 additions and 41 deletions.
75 changes: 34 additions & 41 deletions rag/file_conversion_router/embedding_create.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,37 @@
def string_subtraction(main_string, sub_string):
    """Return *main_string* with the first occurrence of *sub_string* removed.

    Only the leftmost match is dropped; any later occurrences are kept.
    When *sub_string* does not occur, *main_string* is returned unchanged.
    """
    # A replacement count of 1 restricts the removal to the first match.
    trimmed = main_string.replace(sub_string, '', 1)
    return trimmed

def traverse_files(path, start_folder_name, url_list, id_list, doc_list):
    """Collect chunk ids, contents and URLs from every ``.pkl`` file under *path*.

    The directory tree rooted at *path* is walked with ``os.walk``. Each
    ``.pkl`` file is unpickled into an iterable of chunk objects, and for
    every chunk one entry is appended to each of the three output lists, so
    the lists stay index-aligned.

    Args:
        path: Root directory to search. A trailing path separator is allowed.
        start_folder_name: Label used for the root level of the generated
            folder path (it replaces the real name of *path*).
        url_list: Output list; receives ``"URLs:\n"`` followed by the chunk's
            ``chunk_url`` entries joined with newlines.
        id_list: Output list; receives
            ``"<folder> (Level1) > ... > <chunk.titles>"``.
        doc_list: Output list; receives ``chunk.content``.

    Returns:
        None. Results are delivered by appending to the three lists.

    Raises:
        ValueError: If *path* does not exist.
    """
    if not os.path.exists(path):
        raise ValueError(f"The provided path '{path}' does not exist.")

    for root, _dirs, files in os.walk(path):
        pkl_files = [name for name in files if name.endswith('.pkl')]
        if not pkl_files:
            continue

        # Derive the folder levels from the path of `root` relative to `path`.
        # relpath copes with a trailing separator on `path` and with the
        # platform separator, unlike prefix-stripping followed by split('/').
        relative_root = os.path.relpath(root, path)
        if relative_root == os.curdir:
            sub_folders = []
        else:
            sub_folders = relative_root.split(os.sep)
        path_list = [start_folder_name] + sub_folders

        # The folder part of the id is identical for every chunk in this
        # directory, so build it once.
        folder_path = ' > '.join(
            f"{item} (Level{i + 1})" for i, item in enumerate(path_list)
        )

        for file_name in pkl_files:
            file_path = os.path.join(root, file_name)
            with open(file_path, 'rb') as pkl_file:
                print(file_path)
                # NOTE(security): pickle.load can execute arbitrary code;
                # only point this function at trusted .pkl files.
                chunks = pickle.load(pkl_file)

            for chunk in chunks:
                chunk_id = folder_path + ' > ' + chunk.titles
                id_list.append(chunk_id)
                doc_list.append(chunk.content)
                print(chunk.chunk_url)
                url_list.append("URLs:\n" + "\n".join(chunk.chunk_url))
'''
Traverse through files
'''
Expand Down Expand Up @@ -55,44 +86,9 @@ def string_subtraction(main_string, sub_string):


def embedding_create(markdown_path,name, embedding_name, folder_name, model):
def string_subtraction(main_string, sub_string):
return main_string.replace(sub_string, '', 1) # The '1' ensures only the first occurrence is removed

'''
Traverse through files
'''

def traverse_files(path, start_folder_name):
results = []
# Check if the provided path exists
if not os.path.exists(path):
raise ValueError(f"The provided path '{path}' does not exist.")
folder_tree = f"{start_folder_name} (h1)\n"
for root, dir, files in os.walk(path):
for file in files:
if file.endswith('.pkl'):
path_list = [start_folder_name] + string_subtraction(root, path).split('/')[1:]
line = ((len(path_list) - 1) * "--" + path_list[-1] + f" (L{len(path_list)})")
folder_tree += f"{line}\n"

for root, dir, files in os.walk(path):
for file in files:
if file.endswith('.pkl'):
# file path
file_path = os.path.join(root, file)
path_list = [start_folder_name] + string_subtraction(root, path).split('/')[1:]
with open(file_path, 'rb') as pkl_file:
print(file_path)
chunks = pickle.load(pkl_file)
for chunk in chunks:
folder_path = ' > '.join(f"{item} (Level{i + 1})" for i, item in enumerate(path_list))
page_path = chunk.titles
id = folder_path + ' > ' + page_path
id_list.append(id)
doc_list.append(chunk.content)
print(chunk.chunk_url)
url = "URLs:\n" + "\n".join(chunk.chunk_url)
url_list.append(url)
id_list = []
doc_list = []
embedding_list = []
Expand All @@ -102,9 +98,9 @@ def traverse_files(path, start_folder_name):
start = time.time()
# Process each page
# TODO PROCESS DOCUMENTS
docs = traverse_files(markdown_path,name)
docs = traverse_files(markdown_path, name, url_list, id_list, doc_list)

if model=='local' or model=='zephyr':
if model == 'local' or model == 'zephyr':
openai.api_key = "empty"
openai.api_base = "http://localhost:8000/v1"

Expand Down Expand Up @@ -205,7 +201,6 @@ def last_token_pool(last_hidden_states: Tensor,
print(f"Embedding error: {e}")
fail.append(id_list[i])
# count += 1
#
# id_list.extend(ids)
# embedding_list.extend(embedding)
id_list=np.array(id_list)
Expand All @@ -223,8 +218,6 @@ def last_token_pool(last_hidden_states: Tensor,
'url_list': url_list,
'time_list': time_list
}


# Create the folder if it does not exist
if not os.path.exists(folder_name):
os.makedirs(folder_name)
Expand All @@ -235,7 +228,7 @@ def last_token_pool(last_hidden_states: Tensor,
pickle.dump(data_to_store, f)

for i in fail:
print("Failed Embeddings: ",i)
print("Failed Embeddings: ", i)


if __name__ == "__main__":
Expand Down

0 comments on commit 04033e2

Please sign in to comment.