Skip to content

Commit

Permalink
WIP: Refactor chunking into chunker classes
Browse files (browse the repository at this point in the history)
  • Loading branch information
khaledsulayman committed Oct 9, 2024
1 parent 6db0f36 commit 579dd47
Show file tree
Hide file tree
Showing 4 changed files with 625 additions and 656 deletions.
8 changes: 5 additions & 3 deletions src/instructlab/sdg/generate_data.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -311,14 +311,16 @@ def generate_data(
  if not (taxonomy and os.path.exists(taxonomy)):
  raise GenerateException(f"Error: taxonomy ({taxonomy}) does not exist.")

+ date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
+ document_output_dir = Path(output_dir) / f"documents-{date_suffix}"
+
  leaf_nodes = read_taxonomy_leaf_nodes(
- taxonomy, taxonomy_base, yaml_rules, Path(output_dir)
+ taxonomy, taxonomy_base, yaml_rules, document_output_dir
  )
  if not leaf_nodes:
  raise GenerateException("Error: No new leaf nodes found in the taxonomy.")

  name = Path(model_name).stem # Just in case it is a file path
- date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
  output_file_messages = f"messages_{name}_{date_suffix}.jsonl"
  output_file_test = f"test_{name}_{date_suffix}.jsonl"
  output_file_train = f"train_{name}_{date_suffix}.jsonl"
Expand Down Expand Up @@ -366,7 +368,7 @@ def generate_data(
  is_knowledge = False
  leaf_node_path = leaf_node[0]["taxonomy_path"].replace("->", "_")
  samples = leaf_node_to_samples(
- leaf_node, server_ctx_size, chunk_word_count, Path(output_dir), model_name
+ leaf_node, server_ctx_size, chunk_word_count, document_output_dir, model_name
  )

if not samples:
Expand Down
Loading

0 comments on commit 579dd47

Please sign in to comment.