Skip to content

Commit

Permalink
debug docprocessor output
Browse files: browse the repository at this point in the history
  • Loading branch information
khaledsulayman committed Sep 30, 2024
1 parent 883fdcc commit 18c4df1
Showing 1 changed file with 13 additions and 2 deletions.
15 changes: 13 additions & 2 deletions src/instructlab/sdg/utils/docprocessor.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -580,10 +580,21 @@ def chunk_pdfs(
/ "qna.yaml",
)

chunked_pdfs = list(dp.get_processed_dataset())
print(f"THIS IS KHALED: {chunked_pdfs=}")
chunked_pdfs = dp.get_processed_dataset()
formatted = {}
for k, v in chunked_pdfs.to_dict().items():
print(f"{k=}: ; {type(v)=}")
if isinstance(v, list):
formatted[k] = v[:8]
elif isinstance(v, str):
formatted[k] = v.split()[:8]
else:
formatted[k] = v
print(f"THIS IS KHALED: {formatted}")
print(f"THIS IS KHALED: {type(chunked_pdfs)=}")
print(f"THIS IS KHALED: {chunked_pdfs.shape=}")

raise Exception('STOPPING')
return chunked_pdfs


Expand Down

0 comments on commit 18c4df1

Please sign in to comment.