Skip to content
This repository has been archived by the owner on Jul 12, 2024. It is now read-only.

Commit

Permalink
feat: api key auth feature
Browse files Browse the repository at this point in the history
Co-authored-by: loyal812 <jh.chan0812@gmail.com>
  • Loading branch information
eureka320 and loyal812 committed Mar 27, 2024
1 parent c3439d8 commit 6dab2fe
Show file tree
Hide file tree
Showing 15 changed files with 166 additions and 165 deletions.
1 change: 0 additions & 1 deletion check_api_key.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ def delete_api_key(args):

user = "user@gmail.com"
api_key = "AMEYbpdcmrUxNu_Fb80qutukUZdlsmYiH4g7As5LzNA"
description = "description"

# Add options
p = argparse.ArgumentParser()
Expand Down
1 change: 0 additions & 1 deletion delete_api_key.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ def delete_api_key(args):

user = "user@gmail.com"
api_key = "AMEYbpdcmrUxNu_Fb80qutukUZdlsmYiH4g7As5LzNA"
description = "description"

# Add options
p = argparse.ArgumentParser()
Expand Down
225 changes: 122 additions & 103 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from src.pdf2img.Pdf2ImgClass import Pdf2ImgClass
from src.finetune.FineTuningClass import FineTuningClass
from src.mathpix.Mathpix import Mathpix
from src.mongodb.MongoDBClass import MongoDBClass

from src.utils.utils import is_image_file, is_pdf_file, is_text_file, copy_file_to_folder, get_image_pages_percentage

Expand All @@ -22,113 +23,126 @@ def main(args):

payload_data = read_json(args.payload_dir)

# Separate the data
separate_data(payload_data["data_path"], payload_data["threasold_image_percent_of_pdf"])

# pdf to image feature
pdf2img = Pdf2ImgClass(
data_path=payload_data["pdf_data_path"],
parent_path=payload_data["data_path"])

pdf2img.pdf2img()

# img to text feature
# Read images from the image directory
image_list = []
image_data_path = payload_data["images_data_path"]

try:
image_list = [img for img in os.listdir(image_data_path) if img.endswith(".png") or img.endswith(".jpeg") or img.endswith(".jpg")]
except FileNotFoundError:
print("The specified path does not exist or is inaccessible.")

# Call class instance
img_translator = ImageTranslator(api_key=payload_data["api_key"])
mathpix = Mathpix(mathpix_app_id=payload_data["mathpix_app_id"], mathpix_app_key=payload_data["mathpix_app_key"])

# Loop over number of images and append all images
# NOTE: The user can upload images, supply image URLs, or do both
images = []
image_paths = []
if (len(image_list) > 0) and (len(payload_data["image_url"]) > 0):
for image in image_list:
image_path = os.path.join(image_data_path, image)
# Encode image
base64_image = img_translator.encode_image(image_path)
images.append((base64_image, False, "auto"))
image_paths.append(image_path)
for img_url in payload_data["image_url"]:
images.append((img_url, True, "auto"))
image_paths.append(img_url)
elif (len(image_list) > 0) and (len(payload_data["image_url"]) == 0):
for image in image_list:
image_path = os.path.join(image_data_path, image)
# Encode image
base64_image = img_translator.encode_image(image_path)
images.append((base64_image, False, "auto"))
image_paths.append(image_path)
elif (len(image_list) == 0) and (len(payload_data["image_url"]) > 0):
for img_url in payload_data["image_url"]:
images.append((img_url, True, "auto"))
image_paths.append(img_url)

if payload_data["is_gpt"]:
for image in images:
if payload_data["is_parallel"]:
params = [{
img_translator: img_translator,
image: image
}] * payload_data["parallel_count"]

with concurrent.futures.ThreadPoolExecutor() as executor:
results = list(executor.map(lambda args: img2txt(*args), params))

result = make_one_result(payload_data, results)
else:
result = img2txt(img_translator, image)
mongodb = MongoDBClass(
db_name=payload_data["db_name"],
collection_name=payload_data["collection_name"],
mongo_uri=payload_data["mongo_uri"])

is_available = mongodb.check_validation_api(api_key=str(Path(args.api_key)), user=str(Path(args.user)))

if is_available:
print("valid api key")
# Separate the data
separate_data(payload_data["data_path"], payload_data["threasold_image_percent_of_pdf"])

# pdf to image feature
pdf2img = Pdf2ImgClass(
data_path=payload_data["pdf_data_path"],
parent_path=payload_data["data_path"])

pdf2img.pdf2img()

# img to text feature
# Read images from the image directory
image_list = []
image_data_path = payload_data["images_data_path"]

save_to_txt(payload_data, result)
try:
image_list = [img for img in os.listdir(image_data_path) if img.endswith(".png") or img.endswith(".jpeg") or img.endswith(".jpg")]
except FileNotFoundError:
print("The specified path does not exist or is inaccessible.")

# Call class instance
img_translator = ImageTranslator(api_key=payload_data["api_key"])
mathpix = Mathpix(mathpix_app_id=payload_data["mathpix_app_id"], mathpix_app_key=payload_data["mathpix_app_key"])

# Loop over number of images and append all images
# NOTE: The user can upload images, supply image URLs, or do both
images = []
image_paths = []
if (len(image_list) > 0) and (len(payload_data["image_url"]) > 0):
for image in image_list:
image_path = os.path.join(image_data_path, image)
# Encode image
base64_image = img_translator.encode_image(image_path)
images.append((base64_image, False, "auto"))
image_paths.append(image_path)
for img_url in payload_data["image_url"]:
images.append((img_url, True, "auto"))
image_paths.append(img_url)
elif (len(image_list) > 0) and (len(payload_data["image_url"]) == 0):
for image in image_list:
image_path = os.path.join(image_data_path, image)
# Encode image
base64_image = img_translator.encode_image(image_path)
images.append((base64_image, False, "auto"))
image_paths.append(image_path)
elif (len(image_list) == 0) and (len(payload_data["image_url"]) > 0):
for img_url in payload_data["image_url"]:
images.append((img_url, True, "auto"))
image_paths.append(img_url)

if payload_data["is_gpt"]:
for image in images:
if payload_data["is_parallel"]:
params = [{
img_translator: img_translator,
image: image
}] * payload_data["parallel_count"]

with concurrent.futures.ThreadPoolExecutor() as executor:
results = list(executor.map(lambda args: img2txt(*args), params))

result = make_one_result(payload_data, results)
else:
result = img2txt(img_translator, image)

save_to_txt(payload_data, result)
else:
for path in image_paths:
result = mathpix.latex({
'src': mathpix.image_uri(path),
'ocr': ['math', 'text'],
'formats': ['text', 'latex_styled', 'asciimath', 'mathml', 'latex_simplified'],
'format_options': {
'text': {
'transforms': ['rm_spaces', 'rm_newlines'],
'math_delims': ['$', '$']
},
'latex_styled': {'transforms': ['rm_spaces']}
}
})

# print(json.loads(json.dumps(result, indent=4, sort_keys=True))["text"])

save_to_txt(payload_data, json.loads(json.dumps(result, indent=4, sort_keys=True))["text"])

# fine tuning
fine_tune = FineTuningClass(
data_path=payload_data["train_data_path"],
parent_path=payload_data["data_path"],
api_key=payload_data["api_key"],
model=payload_data["model"],
temperature=payload_data["temperature"],
max_retries=payload_data["max_retries"])

# Generate the train and eval data
fine_tune.train_generation()

# Generate the jsonl
fine_tune.jsonl_generation()

# Fine tuning
fine_tune.finetune()

# Report the total processing time (printed to stdout)
end_time = time.time()
msg = f"Total processing time: {end_time - start_time} seconds"
print(msg)
else:
for path in image_paths:
result = mathpix.latex({
'src': mathpix.image_uri(path),
'ocr': ['math', 'text'],
'formats': ['text', 'latex_styled', 'asciimath', 'mathml', 'latex_simplified'],
'format_options': {
'text': {
'transforms': ['rm_spaces', 'rm_newlines'],
'math_delims': ['$', '$']
},
'latex_styled': {'transforms': ['rm_spaces']}
}
})

# print(json.loads(json.dumps(result, indent=4, sort_keys=True))["text"])

save_to_txt(payload_data, json.loads(json.dumps(result, indent=4, sort_keys=True))["text"])

# fine tuning
fine_tune = FineTuningClass(
data_path=payload_data["train_data_path"],
parent_path=payload_data["data_path"],
api_key=payload_data["api_key"],
model=payload_data["model"],
temperature=payload_data["temperature"],
max_retries=payload_data["max_retries"])

# Generate the train and eval data
fine_tune.train_generation()
print("invalide api key")

# Generate the jsonl
fine_tune.jsonl_generation()

# Fine tuning
fine_tune.finetune()

# Report the total processing time (printed to stdout)
end_time = time.time()
msg = f"Total processing time: {end_time - start_time} seconds"
print(msg)
gc.collect()

def save_to_txt(payload_data, result: str):
Expand Down Expand Up @@ -243,10 +257,15 @@ def separate_data(path, threasold):
payload_name = "payload.json"
payload_dir = os.path.join(current_dir, "test", "regression", test_name, "payload", payload_name)

user = "user@gmail.com"
api_key = "AMEYbpdcmrUxNu_Fb80qutukUZdlsmYiH4g7As5LzNA1"

# Add options
p = argparse.ArgumentParser()
p = argparse.ArgumentParser(description="Translate text within an image.")
p.add_argument("--payload_dir", type=Path, default=payload_dir, help="payload directory to the test example")
p.add_argument("--user", type=Path, default=user, help="user")
p.add_argument("--api_key", type=Path, default=api_key, help="title")
args = p.parse_args()

main(args)
49 changes: 0 additions & 49 deletions mongodb.py

This file was deleted.

5 changes: 4 additions & 1 deletion test/regression/regression_test003/payload/payload.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,8 @@
"model": "gpt-3.5-turbo",
"model_id": "ft:gpt-3.5-turbo-0613:personal::8Yk6D8wc",
"temperature": 0.3,
"max_retries": 5
"max_retries": 5,
"mongo_uri": null,
"db_name": "oridosai",
"collection_name": "apis"
}
5 changes: 4 additions & 1 deletion test/regression/regression_test004/payload/payload.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,8 @@
"model": "gpt-3.5-turbo",
"model_id": "ft:gpt-3.5-turbo-0613:personal::8YkcUEuT",
"temperature": 0.3,
"max_retries": 5
"max_retries": 5,
"mongo_uri": null,
"db_name": "oridosai",
"collection_name": "apis"
}
5 changes: 4 additions & 1 deletion test/regression/regression_test005/payload/payload.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,8 @@
"model": "gpt-3.5-turbo",
"model_id": "ft:gpt-3.5-turbo-0613:personal::8YkrFBD6",
"temperature": 0.3,
"max_retries": 5
"max_retries": 5,
"mongo_uri": null,
"db_name": "oridosai",
"collection_name": "apis"
}
5 changes: 4 additions & 1 deletion test/regression/regression_test006/payload/payload.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,8 @@
"model": "gpt-3.5-turbo",
"model_id": "ft:gpt-3.5-turbo-0613:personal::8Yl91t6J",
"temperature": 0.3,
"max_retries": 5
"max_retries": 5,
"mongo_uri": null,
"db_name": "oridosai",
"collection_name": "apis"
}
5 changes: 4 additions & 1 deletion test/regression/regression_test007/payload/payload.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,8 @@
"model": "gpt-3.5-turbo",
"model_id": "ft:gpt-3.5-turbo-0613:personal::8Yh1901T",
"temperature": 0.3,
"max_retries": 5
"max_retries": 5,
"mongo_uri": null,
"db_name": "oridosai",
"collection_name": "apis"
}
5 changes: 4 additions & 1 deletion test/regression/regression_test008/payload/payload.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,8 @@
"model": "gpt-3.5-turbo",
"model_id": "ft:gpt-3.5-turbo-0613:personal::8YlS9jjv",
"temperature": 0.3,
"max_retries": 5
"max_retries": 5,
"mongo_uri": null,
"db_name": "oridosai",
"collection_name": "apis"
}
5 changes: 4 additions & 1 deletion test/regression/regression_test009/payload/payload.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,8 @@
"model": "gpt-3.5-turbo",
"model_id": "",
"temperature": 0.3,
"max_retries": 5
"max_retries": 5,
"mongo_uri": null,
"db_name": "oridosai",
"collection_name": "apis"
}
Loading

0 comments on commit 6dab2fe

Please sign in to comment.