diff --git a/README.md b/README.md index 2bc54d4..e9e5775 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ venv\Scripts\Activate MacOS: venv\bin\activate +pip uninstall -r requirements.txt -y pip install -r requirements.txt ``` @@ -51,3 +52,35 @@ pip install -r requirements.txt py main.py ``` + +### Create, Delete, Check API Key +- Create API Key +``` +py .\create_api_key.py --payload_dir="payload_dir" --user="user email" --title=[Optional] --description=[Optional] + +example: +py .\create_api_key.py --payload_dir="./test/regression/regression_testxxx/payload/mongodb_payload.json" --user="user@gmail.com" --title="title" --description="description" +``` + +- Delete API Key +``` +py .\delete_api_key.py --payload_dir="payload_dir" --user="user email" --api_key="api key" + +example: +py .\delete_api_key.py --payload_dir="./test/regression/regression_testxxx/payload/mongodb_payload.json" --user="user@gmail.com" --api_key="api_key" +``` + +- Check API key +``` +py .\check_api_key.py --payload_dir="payload_dir" --user="user email" --api_key="api key" + +example: +py .\check_api_key.py --payload_dir="./test/regression/regression_testxxx/payload/mongodb_payload.json" --user="user@gmail.com" --api_key="api_key" +``` + +### MongoDB: +{ + "mongo_uri": "mongodb+srv://{user_name}:{password}@cluster0.ill5gnu.mongodb.net", + "db_name": "oridosai", + "collection_name": "apis" +} \ No newline at end of file diff --git a/chatting.py b/chatting.py index b67a3b7..021e4c1 100644 --- a/chatting.py +++ b/chatting.py @@ -5,6 +5,7 @@ from src.utils.read_json import read_json from src.chatting.ChattingClass import ChattingClass +from src.mongodb.MongoDBClass import MongoDBClass def chatting(args): """ @@ -15,14 +16,26 @@ def chatting(args): payload_data = read_json(args.payload_dir) # Call class instance - chatting = ChattingClass( - data_path=payload_data["data_path"], - api_key=payload_data["api_key"], - model_id=payload_data["model_id"], - temperature=payload_data["temperature"]) - - response = chatting.ask_question(args.question) - print(response) + mongodb = MongoDBClass( + db_name=payload_data["db_name"], + collection_name=payload_data["collection_name"], + mongo_uri=payload_data["mongo_uri"]) + + is_available = mongodb.check_validation_api(api_key=str(Path(args.api_key)), user=str(Path(args.user))) + + if is_available: + print("valid api key") + # Call class instance + chatting = ChattingClass( + data_path=payload_data["data_path"], + api_key=payload_data["api_key"], + model_id=payload_data["model_id"], + temperature=payload_data["temperature"]) + + response = chatting.ask_question(args.question) + print(response) + else: + print("invalide api key") gc.collect() @@ -41,11 +54,16 @@ def chatting(args): payload_name = "chatting_payload.json" payload_dir = os.path.join(current_dir, "test", "regression", test_name, "payload", payload_name) + user = "user@gmail.com" + api_key = "AMEYbpdcmrUxNu_Fb80qutukUZdlsmYiH4g7As5LzNA1" + # Add options p = argparse.ArgumentParser() p = argparse.ArgumentParser(description="Conversational Agent.") p.add_argument("--payload_dir", type=Path, default=payload_dir, help="payload directory to the test example") p.add_argument("--question", type=str) + p.add_argument("--user", type=Path, default=user, help="user") + p.add_argument("--api_key", type=Path, default=api_key, help="title") args = p.parse_args() chatting(args) diff --git a/check_api_key.py b/check_api_key.py index 11b5440..e39430f 100644 --- a/check_api_key.py +++ b/check_api_key.py @@ -3,12 +3,13 @@ import gc import argparse from pathlib import Path +from urllib.parse import quote_plus from src.utils.read_json import read_json from src.mongodb.MongoDBClass import MongoDBClass -from pathlib import Path -def delete_api_key(args): + +def check_api_key(args): """ main entry point """ @@ -16,11 +17,24 @@ def delete_api_key(args): # Payload payload_data = read_json(args.payload_dir) + # Your MongoDB Atlas connection details + mongodb_username = payload_data["mongodb_username"] + mongodb_password = payload_data["mongodb_password"] + mongodb_cluster_name = payload_data["mongodb_cluster_name"] + mongodb_database_name = payload_data["mongodb_database_name"] + + # Escape the mongodb_username and mongodb_password + mongodb_escaped_username = quote_plus(mongodb_username) + mongodb_escaped_password = quote_plus(mongodb_password) + + # Construct the MongoDB Atlas URI + mongo_uri = f"mongodb+srv://{mongodb_escaped_username}:{mongodb_escaped_password}@{mongodb_cluster_name}.mongodb.net/{mongodb_database_name}" + # Call class instance mongodb = MongoDBClass( db_name=payload_data["db_name"], collection_name=payload_data["collection_name"], - mongo_uri=payload_data["mongo_uri"]) + mongo_uri=mongo_uri) mongodb.check_validation_api(api_key=str(Path(args.api_key)), user=str(Path(args.user))) @@ -52,4 +66,4 @@ def delete_api_key(args): p.add_argument("--api_key", type=Path, default=api_key, help="title") args = p.parse_args() - delete_api_key(args) \ No newline at end of file + check_api_key(args) \ No newline at end of file diff --git a/create_api_key.py b/create_api_key.py index 02b8202..c3492ca 100644 --- a/create_api_key.py +++ b/create_api_key.py @@ -4,12 +4,12 @@ import argparse from pathlib import Path from datetime import datetime +from urllib.parse import quote_plus from src.utils.read_json import read_json from src.mongodb.MongoDBClass import MongoDBClass -from src.utils.utils import generate_api_key +from src.utils.utils_funcs import generate_api_key from src.models.api_model import APIModel -from pathlib import Path def create_api_key(args): """ @@ -19,11 +19,24 @@ def create_api_key(args): # Payload payload_data = read_json(args.payload_dir) + # Your MongoDB Atlas connection details + mongodb_username = payload_data["mongodb_username"] + mongodb_password = payload_data["mongodb_password"] + mongodb_cluster_name = payload_data["mongodb_cluster_name"] + mongodb_database_name = payload_data["mongodb_database_name"] + + # Escape the mongodb_username and mongodb_password + mongodb_escaped_username = quote_plus(mongodb_username) + mongodb_escaped_password = quote_plus(mongodb_password) + + # Construct the MongoDB Atlas URI + mongo_uri = f"mongodb+srv://{mongodb_escaped_username}:{mongodb_escaped_password}@{mongodb_cluster_name}.mongodb.net/{mongodb_database_name}" + # Call class instance mongodb = MongoDBClass( db_name=payload_data["db_name"], collection_name=payload_data["collection_name"], - mongo_uri=payload_data["mongo_uri"]) + mongo_uri=mongo_uri) api_key = generate_api_key() diff --git a/delete_api_key.py b/delete_api_key.py index 5af9270..d864611 100644 --- a/delete_api_key.py +++ b/delete_api_key.py @@ -3,10 +3,10 @@ import gc import argparse from pathlib import Path +from urllib.parse import quote_plus from src.utils.read_json import read_json from src.mongodb.MongoDBClass import MongoDBClass -from pathlib import Path def delete_api_key(args): """ @@ -16,11 +16,24 @@ def delete_api_key(args): # Payload payload_data = read_json(args.payload_dir) + # Your MongoDB Atlas connection details + mongodb_username = payload_data["mongodb_username"] + mongodb_password = payload_data["mongodb_password"] + mongodb_cluster_name = payload_data["mongodb_cluster_name"] + mongodb_database_name = payload_data["mongodb_database_name"] + + # Escape the mongodb_username and mongodb_password + mongodb_escaped_username = quote_plus(mongodb_username) + mongodb_escaped_password = quote_plus(mongodb_password) + + # Construct the MongoDB Atlas URI + mongo_uri = f"mongodb+srv://{mongodb_escaped_username}:{mongodb_escaped_password}@{mongodb_cluster_name}.mongodb.net/{mongodb_database_name}" + # Call class instance mongodb = MongoDBClass( db_name=payload_data["db_name"], collection_name=payload_data["collection_name"], - mongo_uri=payload_data["mongo_uri"]) + mongo_uri=mongo_uri) mongodb.delete_api(api_key=str(Path(args.api_key)), user=str(Path(args.user))) diff --git a/main.py b/main.py index d590aeb..15b96c3 100644 --- a/main.py +++ b/main.py @@ -4,6 +4,7 @@ import time import argparse from pathlib import Path +from urllib.parse import quote_plus import concurrent.futures from datetime import datetime import json @@ -16,18 +17,31 @@ from src.mathpix.Mathpix import Mathpix from src.mongodb.MongoDBClass import MongoDBClass -from src.utils.utils import is_image_file, is_pdf_file, is_text_file, copy_file_to_folder, get_image_pages_percentage +from src.utils.utils_funcs import is_image_file, is_pdf_file, is_text_file, copy_file_to_folder, get_image_pages_percentage -def main(args): +def total_process(args): start_time = time.time() payload_data = read_json(args.payload_dir) + # Your MongoDB Atlas connection details + mongodb_username = payload_data["mongodb_username"] + mongodb_password = payload_data["mongodb_password"] + mongodb_cluster_name = payload_data["mongodb_cluster_name"] + mongodb_database_name = payload_data["mongodb_database_name"] + + # Escape the mongodb_username and mongodb_password + mongodb_escaped_username = quote_plus(mongodb_username) + mongodb_escaped_password = quote_plus(mongodb_password) + + # Construct the MongoDB Atlas URI + mongo_uri = f"mongodb+srv://{mongodb_escaped_username}:{mongodb_escaped_password}@{mongodb_cluster_name}.mongodb.net/{mongodb_database_name}" + # Call class instance mongodb = MongoDBClass( db_name=payload_data["db_name"], collection_name=payload_data["collection_name"], - mongo_uri=payload_data["mongo_uri"]) + mongo_uri=mongo_uri) is_available = mongodb.check_validation_api(api_key=str(Path(args.api_key)), user=str(Path(args.user))) @@ -268,4 +282,4 @@ def separate_data(path, threasold): p.add_argument("--api_key", type=Path, default=api_key, help="title") args = p.parse_args() - main(args) \ No newline at end of file + total_process(args) \ No newline at end of file diff --git a/mongodb.py b/mongodb.py new file mode 100644 index 0000000..e3158c5 --- /dev/null +++ b/mongodb.py @@ -0,0 +1,49 @@ + +import os +import gc +import argparse +from pathlib import Path + +from src.utils.read_json import read_json +from src.mongodb.MongoDBClass import MongoDBClass + +def mongodb(args): + """ + main entry point + """ + + # Payload + payload_data = read_json(args.payload_dir) + + # Call class instance + mongodb = MongoDBClass( + db_name=payload_data["db_name"], + collection_name=payload_data["collection_name"], + mongo_uri=payload_data["mongo_uri"]) + + mongodb.mongo_connect() + + gc.collect() + +if __name__ == "__main__": + """ + Form command lines + """ + # Clean up buffer memory + gc.collect() + + # Current directory + current_dir = os.path.dirname(os.path.abspath(__file__)) + + # Payload directory + test_name = "regression_test013" + payload_name = "mongodb_payload.json" + payload_dir = os.path.join(current_dir, "test", "regression", test_name, "payload", payload_name) + + # Add options + p = argparse.ArgumentParser() + p = argparse.ArgumentParser(description="Translate text within an image.") + p.add_argument("--payload_dir", type=Path, default=payload_dir, help="payload directory to the test example") + args = p.parse_args() + + mongodb(args) \ No newline at end of file diff --git a/src/main.py b/src/main.py index ddd4f31..c82548b 100644 --- a/src/main.py +++ b/src/main.py @@ -1,4 +1,3 @@ -import io import os import sys @@ -12,16 +11,55 @@ sys.path.append(project_root) sys.path.append(current_script_directory) +from fastapi.middleware.cors import CORSMiddleware from fastapi import FastAPI from starlette.responses import RedirectResponse from starlette.status import HTTP_201_CREATED +from src.models.main_model import MainModel + + # Create a FastAPI application app = FastAPI(swagger_ui_parameters={"tryItOutEnabled": True}) +# Configure CORS +origins = [ + "http://localhost", + "http://localhost:8080", + "http://localhost:3000", + "https://example.com", + "https://www.example.com", +] + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + # Define a route to handle the root endpoint and redirect to the API documentation @app.get("/") async def root(): return RedirectResponse(app.docs_url) + + if request_body.user == "": + user = "user@gmail.com" + else: + user = request_body.user + + if request_body.api_key == "": + api_key = "AMEYbpdcmrUxNu_Fb80qutukUZdlsmYiH4g7As5LzNA1" + else: + api_key = request_body.api_key + + args = { + 'payload_dir' : payload_dir, + 'user' : user, + 'api_key' : api_key + } + + total_process(args) diff --git a/src/models/create_api_model.py b/src/models/create_api_model.py new file mode 100644 index 0000000..b10b7bb --- /dev/null +++ b/src/models/create_api_model.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel +from typing import Optional + +class CreateAPIModel(BaseModel): + user: Optional[str] = "" + title: Optional[str] = "" + description: Optional[str] = "" + data_id: Optional[str] = "" diff --git a/src/models/finetune_model.py b/src/models/finetune_model.py deleted file mode 100644 index c596d17..0000000 --- a/src/models/finetune_model.py +++ /dev/null @@ -1,9 +0,0 @@ -from pydantic import BaseModel -from typing import Optional - -class FineTuneModel(BaseModel): - api_key: Optional[str] = "" - data_path: Optional[str] = "" - model: Optional[str] = "gpt-3.5-turbo" - temperature: Optional[float] = 0.3 - max_retries: Optional[int] = 5 diff --git a/src/models/main_model.py b/src/models/main_model.py new file mode 100644 index 0000000..ffc67a8 --- /dev/null +++ b/src/models/main_model.py @@ -0,0 +1,7 @@ +from pydantic import BaseModel +from typing import Optional + +class MainModel(BaseModel): + api_key: Optional[str] = "" + user: Optional[str] = "" + data_id: Optional[str] = "" diff --git a/src/utils/check_api_key.py b/src/utils/check_api_key.py new file mode 100644 index 0000000..c0be1c3 --- /dev/null +++ b/src/utils/check_api_key.py @@ -0,0 +1,41 @@ + +import os +import gc +import argparse +from pathlib import Path +from urllib.parse import quote_plus + +from src.utils.read_json import read_json +from src.mongodb.MongoDBClass import MongoDBClass + + +def check_api_key(args): + """ + main entry point + """ + + # Payload + payload_data = read_json(args['payload_dir']) + + # Your MongoDB Atlas connection details + mongodb_username = payload_data["mongodb_username"] + mongodb_password = payload_data["mongodb_password"] + mongodb_cluster_name = payload_data["mongodb_cluster_name"] + mongodb_database_name = payload_data["mongodb_database_name"] + + # Escape the mongodb_username and mongodb_password + mongodb_escaped_username = quote_plus(mongodb_username) + mongodb_escaped_password = quote_plus(mongodb_password) + + # Construct the MongoDB Atlas URI + mongo_uri = f"mongodb+srv://{mongodb_escaped_username}:{mongodb_escaped_password}@{mongodb_cluster_name}.mongodb.net/{mongodb_database_name}" + + # Call class instance + mongodb = MongoDBClass( + db_name=payload_data["db_name"], + collection_name=payload_data["collection_name"], + mongo_uri=mongo_uri) + + mongodb.check_validation_api(api_key=str(Path(args['api_key'])), user=str(Path(args['user']))) + + gc.collect() \ No newline at end of file diff --git a/src/utils/create_api.py b/src/utils/create_api.py new file mode 100644 index 0000000..9137745 --- /dev/null +++ b/src/utils/create_api.py @@ -0,0 +1,55 @@ + +import os +import gc +import argparse +from pathlib import Path +from datetime import datetime +from urllib.parse import quote_plus + +from src.utils.read_json import read_json +from src.mongodb.MongoDBClass import MongoDBClass +from src.utils.utils_funcs import generate_api_key +from src.models.api_model import APIModel + +def create_api_key(args): + """ + main entry point + """ + + # Payload + payload_data = read_json(args['payload_dir']) + + # Your MongoDB Atlas connection details + mongodb_username = payload_data["mongodb_username"] + mongodb_password = payload_data["mongodb_password"] + mongodb_cluster_name = payload_data["mongodb_cluster_name"] + mongodb_database_name = payload_data["mongodb_database_name"] + + # Escape the mongodb_username and mongodb_password + mongodb_escaped_username = quote_plus(mongodb_username) + mongodb_escaped_password = quote_plus(mongodb_password) + + # Construct the MongoDB Atlas URI + mongo_uri = f"mongodb+srv://{mongodb_escaped_username}:{mongodb_escaped_password}@{mongodb_cluster_name}.mongodb.net/{mongodb_database_name}" + + # Call class instance + mongodb = MongoDBClass( + db_name=payload_data["db_name"], + collection_name=payload_data["collection_name"], + mongo_uri=mongo_uri) + + api_key = generate_api_key() + + data:APIModel = { + "user": str(Path(args['user'])), + "api": api_key, + "title": str(Path(args['title'])), + "description": str(Path(args['description'])), + "is_removed": False, + "created_at": datetime.now(), + "updated_at": datetime.now(), + } + + mongodb.create_api(data) + + gc.collect() diff --git a/src/utils/delete_api_key.py b/src/utils/delete_api_key.py new file mode 100644 index 0000000..7b86f4f --- /dev/null +++ b/src/utils/delete_api_key.py @@ -0,0 +1,40 @@ + +import os +import gc +import argparse +from pathlib import Path +from urllib.parse import quote_plus + +from src.utils.read_json import read_json +from src.mongodb.MongoDBClass import MongoDBClass + +def delete_api_key(args): + """ + main entry point + """ + + # Payload + payload_data = read_json(args['payload_dir']) + + # Your MongoDB Atlas connection details + mongodb_username = payload_data["mongodb_username"] + mongodb_password = payload_data["mongodb_password"] + mongodb_cluster_name = payload_data["mongodb_cluster_name"] + mongodb_database_name = payload_data["mongodb_database_name"] + + # Escape the mongodb_username and mongodb_password + mongodb_escaped_username = quote_plus(mongodb_username) + mongodb_escaped_password = quote_plus(mongodb_password) + + # Construct the MongoDB Atlas URI + mongo_uri = f"mongodb+srv://{mongodb_escaped_username}:{mongodb_escaped_password}@{mongodb_cluster_name}.mongodb.net/{mongodb_database_name}" + + # Call class instance + mongodb = MongoDBClass( + db_name=payload_data["db_name"], + collection_name=payload_data["collection_name"], + mongo_uri=mongo_uri) + + mongodb.delete_api(api_key=str(Path(args['api_key'])), user=str(Path(args['user']))) + + gc.collect() \ No newline at end of file diff --git a/src/utils/total_process.py b/src/utils/total_process.py new file mode 100644 index 0000000..284ef6f --- /dev/null +++ b/src/utils/total_process.py @@ -0,0 +1,257 @@ + +import os +import gc +import time +import argparse +from pathlib import Path +from urllib.parse import quote_plus +import concurrent.futures +from datetime import datetime +import json + +from src.utils.read_json import read_json +from src.utils.image_translator import ImageTranslator +from src.utils.chatgpt_communicator import ChatGPTCommunicator +from src.pdf2img.Pdf2ImgClass import Pdf2ImgClass +from src.finetune.FineTuningClass import FineTuningClass +from src.mathpix.Mathpix import Mathpix +from src.mongodb.MongoDBClass import MongoDBClass + +from src.utils.utils_funcs import is_image_file, is_pdf_file, is_text_file, copy_file_to_folder, get_image_pages_percentage + +def total_process(args): + start_time = time.time() + + payload_data = read_json(args['payload_dir']) + + # Your MongoDB Atlas connection details + mongodb_username = payload_data["mongodb_username"] + mongodb_password = payload_data["mongodb_password"] + mongodb_cluster_name = payload_data["mongodb_cluster_name"] + mongodb_database_name = payload_data["mongodb_database_name"] + + # Escape the mongodb_username and mongodb_password + mongodb_escaped_username = quote_plus(mongodb_username) + mongodb_escaped_password = quote_plus(mongodb_password) + + # Construct the MongoDB Atlas URI + mongo_uri = f"mongodb+srv://{mongodb_escaped_username}:{mongodb_escaped_password}@{mongodb_cluster_name}.mongodb.net/{mongodb_database_name}" + + # Call class instance + mongodb = MongoDBClass( + db_name=payload_data["db_name"], + collection_name=payload_data["collection_name"], + mongo_uri=mongo_uri) + + is_available = mongodb.check_validation_api(api_key=str(Path(args['api_key'])), user=str(Path(args['user']))) + + if is_available: + print("valid api key") + # Separate the data + separate_data(payload_data["data_path"], payload_data["threasold_image_percent_of_pdf"]) + + # pdf to image feature + pdf2img = Pdf2ImgClass( + data_path=payload_data["pdf_data_path"], + parent_path=payload_data["data_path"]) + + pdf2img.pdf2img() + + # img to text feature + # Read images from the image directory + image_list = [] + image_data_path = payload_data["images_data_path"] + + try: + image_list = [img for img in os.listdir(image_data_path) if img.endswith(".png") or img.endswith(".jpeg") or img.endswith(".jpg")] + except FileNotFoundError: + print("The specified path does not exist or is inaccessible.") + + # Call class instance + img_translator = ImageTranslator(api_key=payload_data["api_key"]) + mathpix = Mathpix(mathpix_app_id=payload_data["mathpix_app_id"], mathpix_app_key=payload_data["mathpix_app_key"]) + + # Loop over number of images and append all images + # NOTE: User can upload image and add image URLs or just upload image or just add image URLs + images = [] + image_paths = [] + if (len(image_list) > 0) and (len(payload_data["image_url"]) > 0): + for image in image_list: + image_path = os.path.join(image_data_path, image) + # Encode image + base64_image = img_translator.encode_image(image_path) + images.append((base64_image, False, "auto")) + image_paths.append(image_path) + for img_url in payload_data["image_url"]: + images.append((img_url, True, "auto")) + image_paths.append(img_url) + elif (len(image_list) > 0) and (len(payload_data["image_url"]) == 0): + for image in image_list: + image_path = os.path.join(image_data_path, image) + # Encode image + base64_image = img_translator.encode_image(image_path) + images.append((base64_image, False, "auto")) + image_paths.append(image_path) + elif (len(image_list) == 0) and (len(payload_data["image_url"]) > 0): + for img_url in payload_data["image_url"]: + images.append((img_url, True, "auto")) + image_paths.append(img_url) + + if payload_data["is_gpt"]: + for image in images: + if payload_data["is_parallel"]: + params = [{ + img_translator: img_translator, + image: image + }] * payload_data["parallel_count"] + + with concurrent.futures.ThreadPoolExecutor() as executor: + results = list(executor.map(lambda args: img2txt(*args), params)) + + result = make_one_result(payload_data, results) + else: + result = img2txt(img_translator, image) + + save_to_txt(payload_data, result) + else: + for path in image_paths: + result = mathpix.latex({ + 'src': mathpix.image_uri(path), + 'ocr': ['math', 'text'], + 'formats': ['text', 'latex_styled', 'asciimath', 'mathml', 'latex_simplified'], + 'format_options': { + 'text': { + 'transforms': ['rm_spaces', 'rm_newlines'], + 'math_delims': ['$', '$'] + }, + 'latex_styled': {'transforms': ['rm_spaces']} + } + }) + + # print(json.loads(json.dumps(result, indent=4, sort_keys=True))["text"]) + + save_to_txt(payload_data, json.loads(json.dumps(result, indent=4, sort_keys=True))["text"]) + + # fine tuning + fine_tune = FineTuningClass( + data_path=payload_data["train_data_path"], + parent_path=payload_data["data_path"], + api_key=payload_data["api_key"], + model=payload_data["model"], + temperature=payload_data["temperature"], + max_retries=payload_data["max_retries"]) + + # Generate the train and eval data + fine_tune.train_generation() + + # Generate the jsonl + fine_tune.jsonl_generation() + + # Fine tuning + fine_tune.finetune() + + # Write into log file + end_time = time.time() + msg = f"Total processing time: {end_time - start_time} seconds" + print(msg) + else: + print("invalide api key") + + gc.collect() + +def save_to_txt(payload_data, result: str): + current_time = datetime.now().strftime('%y_%m_%d_%H_%M_%S') + train_path = os.path.join(payload_data["data_path"], "train_data") + os.makedirs(train_path, exist_ok=True) # This line will create the directory if it doesn't exist + + with open(f'{train_path}/{current_time}_data.txt', "a", encoding="utf-8") as f: + f.write(result + "\n\n") # Append the new data to the end of the file + +def img2txt(img_translator: ImageTranslator, image): + max_retries = 5 + last_error = "" + + img_translator_response = None # Define the variable and initialize it to None + + for attempt in range(max_retries): + try: + response = img_translator.analyze_images([image]) + + if "choices" in response and response["choices"]: + first_choice = response["choices"][0] + if "message" in first_choice and "content" in first_choice["message"] and first_choice["message"]["content"]: + img_translator_response = first_choice["message"]["content"] + break # Successful response, break out of the loop + else: + last_error = "No valid content in the response." + else: + last_error = "The response structure is not as expected." + + except Exception as e: + last_error = f"Attempt {attempt + 1} failed: {e}" + + if img_translator_response: + break # If a successful response is obtained, exit the loop + + if img_translator_response is None: + raise Exception("Failed to get a valid response after " + str(max_retries) + " attempts. Last error: " + last_error) + + return img_translator_response + +def make_one_result(payload_data, results: [str]): + response = payload_data["merge_prompt"] + for index, result in enumerate(results): + response += f"\nresult {index + 1}: {result}" + + # Create chatGPT communicator + chatgpt_communicator = ChatGPTCommunicator(api_key=payload_data["api_key"], language_model=payload_data["language_model"]) + + # Start conversation with ChatGPT using the transcribed or translated text + chatgpt_communicator.create_chat(response) + + # Get conversation with ChatGPT + max_retries = 3 + chatgpt_response = None + + for attempt in range(max_retries): + try: + chatgpt_response = chatgpt_communicator.get_response() + # Check if the response is valid (not None and not empty) + if chatgpt_response: + break # Valid response, break out of the loop + except Exception as e: + print(f"Attempt {attempt + 1} failed: {e}") + if attempt == max_retries - 1: + raise Exception(f"Failed to get a valid response from ChatGPT after {max_retries} attempts. Last error: {e}") + + # Print response and use it somewhere else + # print(chatgpt_response) + + + return chatgpt_response + +def separate_data(path, threasold): + source_folder = path + images_folder = os.path.join(path, "images") + pdf_folder = os.path.join(path, "pdf") + train_folder = os.path.join(path, "train_data") + + file_list = os.listdir(source_folder) + for file_name in file_list: + file_path = os.path.join(source_folder, file_name) + if os.path.isfile(file_path): + if is_image_file(file_path): + copy_file_to_folder(file_path, images_folder) + elif is_text_file(file_path): + copy_file_to_folder(file_path, train_folder) + elif is_pdf_file(file_path): + # if check_pdf_content(file_path) == "text": + # copy_file_to_folder(file_path, train_folder) + # if has_text(file_path): + # copy_file_to_folder(file_path, train_folder) + if get_image_pages_percentage(file_path) < threasold: + # pdf is mostly consist of text + copy_file_to_folder(file_path, train_folder) + else: + # pdf is mostly consist of image + copy_file_to_folder(file_path, pdf_folder) \ No newline at end of file diff --git a/src/utils/utils.py b/src/utils/utils_funcs.py similarity index 100% rename from src/utils/utils.py rename to src/utils/utils_funcs.py diff --git a/test/regression/regression_test003/payload/mongodb_payload.json b/test/regression/regression_test003/payload/mongodb_payload.json new file mode 100644 index 0000000..67a399d --- /dev/null +++ b/test/regression/regression_test003/payload/mongodb_payload.json @@ -0,0 +1,8 @@ +{ + "mongodb_username": "mongodb_username", + "mongodb_password": "mongodb_password", + "mongodb_cluster_name": "mongodb_cluster_name", + "mongodb_database_name": "", + "db_name": "oridosai", + "collection_name": "apis" +} diff --git a/test/regression/regression_test003/payload/payload.json b/test/regression/regression_test003/payload/payload.json index 67d4c7c..889709c 100644 --- a/test/regression/regression_test003/payload/payload.json +++ b/test/regression/regression_test003/payload/payload.json @@ -17,7 +17,10 @@ "model_id": "ft:gpt-3.5-turbo-0613:personal::8Yk6D8wc", "temperature": 0.3, "max_retries": 5, - "mongo_uri": null, + "mongodb_username": "mongodb_username", + "mongodb_password": "mongodb_password", + "mongodb_cluster_name": "mongodb_cluster_name", + "mongodb_database_name": "", "db_name": "oridosai", "collection_name": "apis" } \ No newline at end of file diff --git a/test/regression/regression_test004/payload/mongodb_payload.json b/test/regression/regression_test004/payload/mongodb_payload.json new file mode 100644 index 0000000..67a399d --- /dev/null +++ b/test/regression/regression_test004/payload/mongodb_payload.json @@ -0,0 +1,8 @@ +{ + "mongodb_username": "mongodb_username", + "mongodb_password": "mongodb_password", + "mongodb_cluster_name": "mongodb_cluster_name", + "mongodb_database_name": "", + "db_name": "oridosai", + "collection_name": "apis" +} diff --git a/test/regression/regression_test004/payload/payload.json b/test/regression/regression_test004/payload/payload.json index cc0b157..4109d88 100644 --- a/test/regression/regression_test004/payload/payload.json +++ b/test/regression/regression_test004/payload/payload.json @@ -17,7 +17,10 @@ "model_id": "ft:gpt-3.5-turbo-0613:personal::8YkcUEuT", "temperature": 0.3, "max_retries": 5, - "mongo_uri": null, + "mongodb_username": "mongodb_username", + "mongodb_password": "mongodb_password", + "mongodb_cluster_name": "mongodb_cluster_name", + "mongodb_database_name": "", "db_name": "oridosai", "collection_name": "apis" } \ No newline at end of file diff --git a/test/regression/regression_test005/payload/mongodb_payload.json b/test/regression/regression_test005/payload/mongodb_payload.json new file mode 100644 index 0000000..67a399d --- /dev/null +++ b/test/regression/regression_test005/payload/mongodb_payload.json @@ -0,0 +1,8 @@ +{ + "mongodb_username": "mongodb_username", + "mongodb_password": "mongodb_password", + "mongodb_cluster_name": "mongodb_cluster_name", + "mongodb_database_name": "", + "db_name": "oridosai", + "collection_name": "apis" +} diff --git a/test/regression/regression_test005/payload/payload.json b/test/regression/regression_test005/payload/payload.json index 1b99f86..0708b72 100644 --- a/test/regression/regression_test005/payload/payload.json +++ b/test/regression/regression_test005/payload/payload.json @@ -17,7 +17,10 @@ "model_id": "ft:gpt-3.5-turbo-0613:personal::8YkrFBD6", "temperature": 0.3, "max_retries": 5, - "mongo_uri": null, + "mongodb_username": "mongodb_username", + "mongodb_password": "mongodb_password", + "mongodb_cluster_name": "mongodb_cluster_name", + "mongodb_database_name": "", "db_name": "oridosai", "collection_name": "apis" } \ No newline at end of file diff --git a/test/regression/regression_test006/payload/mongodb_payload.json b/test/regression/regression_test006/payload/mongodb_payload.json new file mode 100644 index 0000000..67a399d --- /dev/null +++ b/test/regression/regression_test006/payload/mongodb_payload.json @@ -0,0 +1,8 @@ +{ + "mongodb_username": "mongodb_username", + "mongodb_password": "mongodb_password", + "mongodb_cluster_name": "mongodb_cluster_name", + "mongodb_database_name": "", + "db_name": "oridosai", + "collection_name": "apis" +} diff --git a/test/regression/regression_test006/payload/payload.json b/test/regression/regression_test006/payload/payload.json index d1a591a..b9526a5 100644 --- a/test/regression/regression_test006/payload/payload.json +++ b/test/regression/regression_test006/payload/payload.json @@ -17,7 +17,10 @@ "model_id": "ft:gpt-3.5-turbo-0613:personal::8Yl91t6J", "temperature": 0.3, "max_retries": 5, - "mongo_uri": null, + "mongodb_username": "mongodb_username", + "mongodb_password": "mongodb_password", + "mongodb_cluster_name": "mongodb_cluster_name", + "mongodb_database_name": "", "db_name": "oridosai", "collection_name": "apis" } \ No newline at end of file diff --git a/test/regression/regression_test007/payload/mongodb_payload.json b/test/regression/regression_test007/payload/mongodb_payload.json new file mode 100644 index 0000000..67a399d --- /dev/null +++ b/test/regression/regression_test007/payload/mongodb_payload.json @@ -0,0 +1,8 @@ +{ + "mongodb_username": "mongodb_username", + "mongodb_password": "mongodb_password", + "mongodb_cluster_name": "mongodb_cluster_name", + "mongodb_database_name": "", + "db_name": "oridosai", + "collection_name": "apis" +} diff --git a/test/regression/regression_test007/payload/payload.json b/test/regression/regression_test007/payload/payload.json index 6441034..c502f06 100644 --- a/test/regression/regression_test007/payload/payload.json +++ b/test/regression/regression_test007/payload/payload.json @@ -17,7 +17,10 @@ "model_id": "ft:gpt-3.5-turbo-0613:personal::8Yh1901T", "temperature": 0.3, "max_retries": 5, - "mongo_uri": null, + "mongodb_username": "mongodb_username", + "mongodb_password": "mongodb_password", + "mongodb_cluster_name": "mongodb_cluster_name", + "mongodb_database_name": "", "db_name": "oridosai", "collection_name": "apis" } \ No newline at end of file diff --git a/test/regression/regression_test008/payload/mongodb_payload.json b/test/regression/regression_test008/payload/mongodb_payload.json new file mode 100644 index 0000000..67a399d --- /dev/null +++ b/test/regression/regression_test008/payload/mongodb_payload.json @@ -0,0 +1,8 @@ +{ + "mongodb_username": "mongodb_username", + "mongodb_password": "mongodb_password", + "mongodb_cluster_name": "mongodb_cluster_name", + "mongodb_database_name": "", + "db_name": "oridosai", + "collection_name": "apis" +} diff --git a/test/regression/regression_test008/payload/payload.json b/test/regression/regression_test008/payload/payload.json index e0a3655..f27d82e 100644 --- a/test/regression/regression_test008/payload/payload.json +++ b/test/regression/regression_test008/payload/payload.json @@ -17,7 +17,10 @@ "model_id": "ft:gpt-3.5-turbo-0613:personal::8YlS9jjv", "temperature": 0.3, "max_retries": 5, - "mongo_uri": null, + "mongodb_username": "mongodb_username", + "mongodb_password": "mongodb_password", + "mongodb_cluster_name": "mongodb_cluster_name", + "mongodb_database_name": "", "db_name": "oridosai", "collection_name": "apis" } \ No newline at end of file diff --git a/test/regression/regression_test009/payload/mongodb_payload.json b/test/regression/regression_test009/payload/mongodb_payload.json new file mode 100644 index 0000000..67a399d --- /dev/null +++ b/test/regression/regression_test009/payload/mongodb_payload.json @@ -0,0 +1,8 @@ +{ + "mongodb_username": "mongodb_username", + "mongodb_password": "mongodb_password", + "mongodb_cluster_name": "mongodb_cluster_name", + "mongodb_database_name": "", + "db_name": "oridosai", + "collection_name": "apis" +} diff --git a/test/regression/regression_test009/payload/payload.json b/test/regression/regression_test009/payload/payload.json index 2d6b9a4..19f72ab 100644 --- a/test/regression/regression_test009/payload/payload.json +++ b/test/regression/regression_test009/payload/payload.json @@ -17,7 +17,10 @@ "model_id": "", "temperature": 0.3, "max_retries": 5, - "mongo_uri": null, + "mongodb_username": "mongodb_username", + "mongodb_password": "mongodb_password", + "mongodb_cluster_name": "mongodb_cluster_name", + "mongodb_database_name": "", "db_name": "oridosai", "collection_name": "apis" } \ No newline at end of file diff --git a/test/regression/regression_test010/payload/mongodb_payload.json b/test/regression/regression_test010/payload/mongodb_payload.json new file mode 100644 index 0000000..67a399d --- /dev/null +++ b/test/regression/regression_test010/payload/mongodb_payload.json @@ -0,0 +1,8 @@ +{ + "mongodb_username": "mongodb_username", + "mongodb_password": "mongodb_password", + "mongodb_cluster_name": "mongodb_cluster_name", + "mongodb_database_name": "", + "db_name": "oridosai", + "collection_name": "apis" +} diff --git a/test/regression/regression_test010/payload/payload.json b/test/regression/regression_test010/payload/payload.json index 766f0e9..7da5966 100644 --- a/test/regression/regression_test010/payload/payload.json +++ b/test/regression/regression_test010/payload/payload.json @@ -17,7 +17,10 @@ "model_id": "", "temperature": 0.3, "max_retries": 5, - "mongo_uri": null, + "mongodb_username": "mongodb_username", + "mongodb_password": "mongodb_password", + "mongodb_cluster_name": "mongodb_cluster_name", + "mongodb_database_name": "", "db_name": "oridosai", "collection_name": "apis" } \ No newline at end of file diff --git a/test/regression/regression_test011/payload/mongodb_payload.json b/test/regression/regression_test011/payload/mongodb_payload.json new file mode 100644 index 0000000..67a399d --- /dev/null +++ b/test/regression/regression_test011/payload/mongodb_payload.json @@ -0,0 +1,8 @@ +{ + "mongodb_username": "mongodb_username", + "mongodb_password": "mongodb_password", + "mongodb_cluster_name": "mongodb_cluster_name", + "mongodb_database_name": "", + "db_name": "oridosai", + "collection_name": "apis" +} diff --git a/test/regression/regression_test011/payload/payload.json b/test/regression/regression_test011/payload/payload.json index 776ca07..1051617 100644 --- a/test/regression/regression_test011/payload/payload.json +++ b/test/regression/regression_test011/payload/payload.json @@ -17,7 +17,10 @@ "model_id": "", "temperature": 0.3, "max_retries": 5, - "mongo_uri": null, + "mongodb_username": "mongodb_username", + "mongodb_password": "mongodb_password", + "mongodb_cluster_name": "mongodb_cluster_name", + "mongodb_database_name": "", "db_name": "oridosai", "collection_name": "apis" } \ No newline at end of file diff --git a/test/regression/regression_test012/payload/mongodb_payload.json b/test/regression/regression_test012/payload/mongodb_payload.json new file mode 100644 index 0000000..67a399d --- /dev/null +++ b/test/regression/regression_test012/payload/mongodb_payload.json @@ -0,0 +1,8 @@ +{ + "mongodb_username": "mongodb_username", + "mongodb_password": "mongodb_password", + "mongodb_cluster_name": "mongodb_cluster_name", + "mongodb_database_name": "", + "db_name": "oridosai", + "collection_name": "apis" +} diff --git a/test/regression/regression_test012/payload/payload.json b/test/regression/regression_test012/payload/payload.json index 4346006..b48ecdc 100644 --- a/test/regression/regression_test012/payload/payload.json +++ b/test/regression/regression_test012/payload/payload.json @@ -17,7 +17,10 @@ "model_id": "", "temperature": 0.3, "max_retries": 5, - "mongo_uri": null, + "mongodb_username": "mongodb_username", + "mongodb_password": "mongodb_password", + "mongodb_cluster_name": "mongodb_cluster_name", + "mongodb_database_name": "", "db_name": "oridosai", "collection_name": "apis" } \ No newline at end of file diff --git a/test/regression/regression_test013/payload/mongodb_payload.json b/test/regression/regression_test013/payload/mongodb_payload.json index dbae6ab..67a399d 100644 --- a/test/regression/regression_test013/payload/mongodb_payload.json +++ b/test/regression/regression_test013/payload/mongodb_payload.json @@ -1,5 +1,8 @@ { - "mongo_uri": null, + "mongodb_username": "mongodb_username", + "mongodb_password": "mongodb_password", + "mongodb_cluster_name": "mongodb_cluster_name", + "mongodb_database_name": "", "db_name": "oridosai", "collection_name": "apis" } diff --git a/test/regression/regression_test013/payload/payload.json b/test/regression/regression_test013/payload/payload.json index 76e461a..1bbd092 100644 --- a/test/regression/regression_test013/payload/payload.json +++ b/test/regression/regression_test013/payload/payload.json @@ -17,7 +17,10 @@ "model_id": "ft:gpt-3.5-turbo-0613:personal::8ZmyxMfe", "temperature": 0.3, "max_retries": 5, - "mongo_uri": null, + "mongodb_username": "scarlett1130", + "mongodb_password": "4zv2NnaSbKIWMRPv", + "mongodb_cluster_name": "cluster0.ill5gnu", + "mongodb_database_name": "", "db_name": "oridosai", "collection_name": "apis" } \ No newline at end of file