-
Notifications
You must be signed in to change notification settings - Fork 50
Update DI to CU converter for GA #131
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,6 @@ | ||
| # Rename to .env | ||
| HOST="<fill in your target endpoint here>" | ||
|
|
||
| API_VERSION = "2025-05-01-preview" | ||
| API_VERSION = "2025-11-01" | ||
|
|
||
| SUBSCRIPTION_KEY = "<fill in your API Key here>" # This is your API Key if you have one or can be your Subscription ID | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,7 +12,7 @@ | |
| from rich import print # For colored output | ||
|
|
||
| # imports from same project | ||
| from constants import CU_API_VERSION, MAX_FIELD_LENGTH, VALID_CU_FIELD_TYPES | ||
| from constants import CU_API_VERSION, MAX_FIELD_LENGTH, VALID_CU_FIELD_TYPES, COMPLETION_MODEL, EMBEDDING_MODEL | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hi @aainav269, I found that we only validate the length of the field name; we do not check or normalize the field name against our current field-name restrictions. It seems we also don't check or remove the field format. Do you recall the discussion about field name normalization in this tool?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, we decided then to remove the fields that exceed the field name length limit. One point of discussion was whether a shortened field name could collide with another field's name. Ex: if we have ...._Yes and ...._No and we shorten both, both would become .... I don't think we ever validated the field format. I think we assumed that if the field was already generated by DI, the format would apply to CU as well. What are you thinking of enforcing for this?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. CU has more restrictions on field names than DI, such as no whitespace and no symbols other than underscores. If we didn't skip this intentionally, I will add some logic to validate and modify the field names. |
||
| from field_definitions import FieldDefinitions | ||
|
|
||
| # schema constants subject to change | ||
|
|
@@ -48,7 +48,7 @@ def format_angle(angle: float) -> float: | |
| formatted_num = f"{rounded_angle:.7f}".rstrip('0') # Remove trailing zeros | ||
| return float(formatted_num) | ||
|
|
||
| def convert_fields_to_analyzer(fields_json_path: Path, analyzer_prefix: Optional[str], target_dir: Path, field_definitions: FieldDefinitions) -> dict: | ||
| def convert_fields_to_analyzer(fields_json_path: Path, analyzer_prefix: Optional[str], target_dir: Path, field_definitions: FieldDefinitions, target_container_sas_url: str = None, target_blob_folder: str = None) -> dict: | ||
| """ | ||
| Convert DI 4.0 preview Custom Document fields.json to analyzer.json format. | ||
| Args: | ||
|
|
@@ -79,7 +79,11 @@ def convert_fields_to_analyzer(fields_json_path: Path, analyzer_prefix: Optional | |
| # build analyzer.json appropriately | ||
| analyzer_data = { | ||
| "analyzerId": analyzer_id, | ||
| "baseAnalyzerId": "prebuilt-documentAnalyzer", | ||
| "baseAnalyzerId": "prebuilt-document", | ||
| "models": { | ||
| "completion": COMPLETION_MODEL, | ||
| "embedding": EMBEDDING_MODEL | ||
| }, | ||
| "config": { | ||
| "returnDetails": True, | ||
| # Add the following line as a temp workaround before service issue is fixed. | ||
|
|
@@ -121,6 +125,17 @@ def convert_fields_to_analyzer(fields_json_path: Path, analyzer_prefix: Optional | |
| else: | ||
| analyzer_json_path = fields_json_path.parent / 'analyzer.json' | ||
|
|
||
| # Add knowledgeSources section if container info is provided | ||
| if target_container_sas_url and target_blob_folder: | ||
| analyzer_data["knowledgeSources"] = [ | ||
| { | ||
| "kind": "labeledData", | ||
| "containerUrl": target_container_sas_url, | ||
| "prefix": target_blob_folder, | ||
| "fileListPath": "" | ||
| } | ||
| ] | ||
|
|
||
| # Ensure target directory exists | ||
| analyzer_json_path.parent.mkdir(parents=True, exist_ok=True) | ||
|
|
||
|
|
@@ -287,7 +302,11 @@ def recursive_convert_di_label_to_cu_helper(value: dict) -> dict: | |
| di_label["valueDate"] = date_string # going with the default | ||
| elif value_type == "number": | ||
| try: | ||
| di_label["valueNumber"] = float(value.get("content")) # content can be easily converted to a float | ||
| content_val = value.get("content") | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hi @aainav269, I encountered some errors when I tried to convert fields labeled by region in DI Studio, which do not have content. I'm wondering whether we encountered this error before and whether it is okay to set the value to None.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I do remember seeing these region fields before, but only in DI 3.1. I think we decided to just ignore these region fields when converting to CU.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I see. I need to set the value to None to avoid the errors. |
||
| if not content_val: | ||
| di_label["valueNumber"] = None | ||
| else: | ||
| di_label["valueNumber"] = float(content_val) # content can be easily converted to a float | ||
| except Exception as ex: | ||
| # strip the string of all non-numerical values and periods | ||
| string_value = value.get("content") | ||
|
|
@@ -296,16 +315,27 @@ def recursive_convert_di_label_to_cu_helper(value: dict) -> dict: | |
| # if more than one period exists, remove them all | ||
| if cleaned_string.count('.') > 1: | ||
| print("More than one decimal point exists, so will be removing them all.") | ||
| cleaned_string = cleaned_string = re.sub(r'\.', '', string_value) | ||
| di_label["valueNumber"] = float(cleaned_string) | ||
| cleaned_string = re.sub(r'\.', '', string_value) | ||
|
|
||
| if not cleaned_string: | ||
| di_label["valueNumber"] = None | ||
| else: | ||
| di_label["valueNumber"] = float(cleaned_string) | ||
| elif value_type == "integer": | ||
| try: | ||
| di_label["valueInteger"] = int(value.get("content")) # content can be easily converted to an int | ||
| content_val = value.get("content") | ||
| if not content_val: | ||
| di_label["valueInteger"] = None | ||
| else: | ||
| di_label["valueInteger"] = int(content_val) # content can be easily converted to an int | ||
| except Exception as ex: | ||
| # strip the string of all non-numerical values | ||
| string_value = value.get("content") | ||
| cleaned_string = re.sub(r'[^0-9]', '', string_value) | ||
| di_label["valueInteger"] = int(cleaned_string) | ||
| if not cleaned_string: | ||
| di_label["valueInteger"] = None | ||
| else: | ||
| di_label["valueInteger"] = int(cleaned_string) | ||
| else: | ||
| di_label[value_part] = value.get("content") | ||
| di_label["spans"] = value.get("spans", []) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.