allenai · natolambert · Nov 3, 2025 · gemini-code-assist · Nov 3, 2025 · gemini-code-assist
diff --git a/open_instruct/dataset_transformation.py b/open_instruct/dataset_transformation.py
@@ -403,6 +403,12 @@ def visualize_token_role(tokens: list[int], masks: list[int], tokenizer: PreTrai
         "{% if not has_system %}"
         "{{ '<|im_start|>system\nYou are Olmo, a helpful AI assistant built by Ai2. Your date cutoff is December 2024, and your model weights are available at https://huggingface.co/allenai.<|im_end|>\n' }}"
-        "{{ '<|im_start|>system\nYou are Olmo, a helpful AI assistant built by Ai2. Your date cutoff is December 2024, and your model weights are available at https://huggingface.co/allenai.<|im_end|>\n' }}"
+        "{{ '<|im_start|>system\nYou are Olmo, a helpful AI assistant built by Ai2. Your date cutoff is December 2024, and your model weights are available at https://huggingface.co/allenai. You do not currently have access to any functions. <functions></functions><|im_end|>\n' }}"
-        "{{ '<|im_start|>system\nYou are Olmo, a helpful AI assistant built by Ai2. Your date cutoff is December 2024, and your model weights are available at https://huggingface.co/allenai.<|im_end|>\n' }}"
+        "{{ '<|im_start|>system\nYou are Olmo, a helpful AI assistant built by Ai2. Your date cutoff is December 2024, and your model weights are available at https://huggingface.co/allenai. You do not currently have access to any functions. <functions></functions><|im_end|>\n' }}"
         "{% endif %}"
+        "{% set last_user_ns = namespace(last_user_index=-1) %}"  # a plain `set` inside a Jinja `for` is loop-scoped and never updates the outer variable, so track the index on a namespace object
+        "{% for message in messages %}"
+        "{% if message['role'] == 'user' %}"
+        "{% set last_user_ns.last_user_index = loop.index0 %}"
+        "{% endif %}"
+        "{% endfor %}"
+        "{% set last_user_index = last_user_ns.last_user_index %}"  # re-expose under the original name so the later `loop.index0 > last_user_index` check reads the real value
         "{% for message in messages %}"
         "{% if message['role'] == 'system' %}"
         "{{ '<|im_start|>system\n' + message['content'] }}"
@@ -418,10 +424,18 @@ def visualize_token_role(tokens: list[int], masks: list[int], tokenizer: PreTrai
         "{{ '<|im_start|>user\n' + message['content'] + '<|im_end|>\n' }}"
         "{% endif %}"
         "{% elif message['role'] == 'assistant' %}"
+        "{% set assistant_content = message.get('content', none) or '' %}"  # `.get('content', '')` still yields None when the key is present but null (e.g. function-call-only turns); normalise to '' so the `in` checks and the final render are safe
+        "{% set reasoning_content = '' %}"
+        "{% if '</think>' in assistant_content %}"
+        "{% set think_split = assistant_content.split('</think>') %}"
+        "{% set reasoning_content = think_split[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}"
+        "{% set assistant_content = think_split[-1].lstrip('\\n') %}"
+        "{% endif %}"
-        "{% if '</think>' in assistant_content %}"
-        "{% set think_split = assistant_content.split('</think>') %}"
-        "{% set reasoning_content = think_split[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}"
-        "{% set assistant_content = think_split[-1].lstrip('\\n') %}"
-        "{% endif %}"
+        "{% if '<think>' in assistant_content and '</think>' in assistant_content %}"
+        "{% set _before_think, _after_think = assistant_content.split('<think>', 1) %}"
+        "{% set _reasoning, _after_reasoning = _after_think.split('</think>', 1) %}"
+        "{% set reasoning_content = _reasoning.strip() %}"
+        "{% set assistant_content = (_before_think ~ _after_reasoning).lstrip('\\n') %}{% endif %}"
-        "{% if '</think>' in assistant_content %}"
-        "{% set think_split = assistant_content.split('</think>') %}"
-        "{% set reasoning_content = think_split[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}"
-        "{% set assistant_content = think_split[-1].lstrip('\\n') %}"
-        "{% endif %}"
+        "{% if '<think>' in assistant_content and '</think>' in assistant_content %}"
+        "{% set _before_think, _after_think = assistant_content.split('<think>', 1) %}"
+        "{% set _reasoning, _after_reasoning = _after_think.split('</think>', 1) %}"
+        "{% set reasoning_content = _reasoning.strip() %}"
+        "{% set assistant_content = (_before_think ~ _after_reasoning).lstrip('\\n') %}{% endif %}"
         "{{ '<|im_start|>assistant\n' }}"
-        "{% if message.get('content', none) is not none %}"
-        "{{ message['content'] }}"
+        "{% if loop.index0 > last_user_index and reasoning_content.strip() %}"
+        "{{ '<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' }}"
         "{% endif %}"
+        "{{ assistant_content }}"
         "{% if message.get('function_calls', none) is not none %}"
         "{{ '<function_calls>' + message['function_calls'] + '</function_calls>' }}"
         "{% endif %}"