2 changes: 1 addition & 1 deletion benchmark_code/HLE/eval_hle.py
@@ -17,7 +17,7 @@ def extract_label(data):

def extract_pred(data):
if "predict_result" in data:
return f'["{data["predict_result"]}"]'
return str(data["predict_result"])
if "prediction" in data:
return str(data["prediction"])
if "choices" in data and isinstance(data["choices"], list):
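For reviewers, a minimal sketch of what the changed `extract_pred` branch returns, assuming `data` carries a plain string under `predict_result` (sample value invented; the other branches are unchanged):

```python
# Hypothetical sample; only the predict_result branch is illustrated here.
data = {"predict_result": 'The answer is "B".'}

old = f'["{data["predict_result"]}"]'  # old: '["The answer is "B"."]' -- embedded quotes corrupt the bracketed literal
new = str(data["predict_result"])      # new: 'The answer is "B".' -- the prediction is passed through unchanged

print(old)
print(new)
```

If downstream code ever parses the old bracketed form, predictions containing double quotes would break it, which is one plausible motivation for returning the raw string.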
42 changes: 42 additions & 0 deletions benchmark_code/HLE/preprocess.py
@@ -0,0 +1,42 @@
import os
import sys
sys.path.append(os.path.abspath(os.path.dirname(__file__)))
from eval_hle import generate_prompt
from transformers import AutoProcessor, PreTrainedTokenizer

def convert_sample_to_input_ids(
s:dict,
prompt_type: str,
tokenizer: PreTrainedTokenizer
):
question = s.get("question")
image = s.get("image", "")

if image:
question = "<|vision_start|><|image_pad|><|vision_end|>"+question
inputs = {
"prompt": generate_prompt(question),
"multi_modal_data": {"image": [image]}
}
return inputs
else:
contents = [
{"type": "text", "text": generate_prompt(question)}
]
messages = [
{
"role": "user",
"content": contents
}
]
final_prompt = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
enable_thinking=False
)
# When a multimodal Processor is passed in as the tokenizer, fall back to its inner tokenizer
if "Processor" in type(tokenizer).__name__:
tokenizer = tokenizer.tokenizer
input_ids = tokenizer.encode(final_prompt)
return input_ids
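A minimal usage sketch of the new helper, assuming a local tokenizer directory and a text-only sample; the model path and the `prompt_type` value below are placeholders, not values prescribed by this PR:

```python
# Illustrative only: exercising convert_sample_to_input_ids directly.
from transformers import AutoTokenizer
from benchmark_code.HLE.preprocess import convert_sample_to_input_ids

tokenizer = AutoTokenizer.from_pretrained("/path/to/Qwen3-Omni-30B-A3B-Thinking")  # placeholder path
sample = {"question": "What is the capital of France?", "image": ""}  # text-only sample

# Text-only samples yield a list of token ids; samples with an "image" field
# instead yield a dict with "prompt" and "multi_modal_data".
input_ids = convert_sample_to_input_ids(sample, prompt_type="default", tokenizer=tokenizer)
print(len(input_ids))
```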
19 changes: 19 additions & 0 deletions config/hle.yaml
@@ -0,0 +1,19 @@
# Steps (using Qwen/Qwen3-Omni-30B-A3B-Thinking as an example):
# 1. Download the dataset file (https://huggingface.co/datasets/cais/hle/blob/main/data/test-00000-of-00001.parquet) to a local path
# 2. Download the Qwen/Qwen3-Omni-30B-A3B-Thinking model locally, then install the dependencies needed to run it:
# pip install transformers==4.57.3
# pip install flash-attn --no-build-isolation
# pip install -r requirements.txt
# 3. Run: python run_pipeline.py --config config/hle.yaml --model_path {your_local_path_to_Qwen3-Omni-30B-A3B-Thinking} --backend hf

save_dir: results

preprocess: benchmark_code.HLE.preprocess
thinker_max_new_tokens: 4096

tasks:
HLE:
type: text
data_path: [YOUR_PATH_TO_TEST_PARQUET]
compare_func:
path: benchmark_code/HLE/eval_hle.py
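For orientation, a sketch of how the dotted `preprocess` module path and the `compare_func.path` file path above might be resolved by a config loader; this is an assumption about the pipeline (`run_pipeline.py` is not part of this diff), not its actual implementation:

```python
# Sketch: resolving the two kinds of paths used in config/hle.yaml.
# Assumes PyYAML; the loader shown here is illustrative, not the project's code.
import importlib
import importlib.util
import yaml

with open("config/hle.yaml") as f:
    cfg = yaml.safe_load(f)

# "benchmark_code.HLE.preprocess" -> regular module import
preprocess_mod = importlib.import_module(cfg["preprocess"])

# "benchmark_code/HLE/eval_hle.py" -> module loaded from a file path
cmp_path = cfg["tasks"]["HLE"]["compare_func"]["path"]
spec = importlib.util.spec_from_file_location("eval_hle", cmp_path)
compare_mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(compare_mod)

print(preprocess_mod.convert_sample_to_input_ids, compare_mod.extract_pred)
```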
12 changes: 11 additions & 1 deletion inference/backend/multimodal.py
@@ -1,5 +1,7 @@
import math
import librosa
import base64
from io import BytesIO
from PIL import Image


@@ -23,8 +25,16 @@ def load_multimodal_data(
audio: list[str] | None = None,
return_dict=False,
):
def open_image(x):
if x.startswith("data:image"):
x = x.split(',', 1)[1]
image_data = base64.b64decode(x)
return Image.open(BytesIO(image_data))
else:
return Image.open(x)

if image is not None:
image = [Image.open(x) for x in image]
image = [open_image(x) for x in image]
image = [to_rgb(x) for x in image]
# For Qwen2-VL, Qwen2.5-VL and Omni: work around a processor bug when an image's short side is smaller than 28
# The OCRBench dataset needs MIN_PIXELS adjusted
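A standalone sketch of the two image-source formats the new `open_image` closure accepts, a filesystem path and a base64 `data:image/...` URI; the file name and image below are made up for illustration:

```python
# Illustrative round trip: the same image opened from a path and from a data URI.
import base64
from io import BytesIO
from PIL import Image

img = Image.new("RGB", (32, 32), "red")
img.save("sample.png")

buf = BytesIO()
img.save(buf, format="PNG")
data_uri = "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode()

from_path = Image.open("sample.png")                       # plain path input
payload = data_uri.split(",", 1)[1]                        # strip the "data:image/...;base64," prefix
from_uri = Image.open(BytesIO(base64.b64decode(payload)))  # decoded data-URI input

print(from_path.size, from_uri.size)  # (32, 32) (32, 32)
```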
4 changes: 2 additions & 2 deletions inference/backend/predict_async_hf.py
@@ -109,7 +109,7 @@ async def predict_sample(
else: # list[int]
input_len = len(inputs) # input_ids
input_tensors = dict(
inputs=torch.tensor([inputs], device=model.device),
input_ids=torch.tensor([inputs], device=model.device),
attention_mask=torch.ones((1, input_len), dtype=torch.long, device=model.device)
)

@@ -126,7 +126,7 @@ async def predict_sample(
**kwargs,
) # input + output, shape: (batch_size, total_len)

output = output_ids[0].tolist()[input_len:]
output = output_ids[0].flatten().tolist()[input_len:]

elif otype == 'turn_taking':
model = getattr(model, 'thinker', model) # Qwen2.5-Omni
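For context on the generation changes above: passing the tensor as `input_ids=` names the text-token argument explicitly, and `.flatten()` makes the output slicing robust if `generate` hands back an extra leading dimension. A standalone torch sketch of the slicing behavior (shapes are illustrative, not taken from a specific model):

```python
# Why .flatten().tolist()[input_len:] is safer than .tolist()[input_len:].
import torch

input_len = 3

usual = torch.tensor([[101, 102, 103, 9, 8, 7]])     # (batch, total_len)
print(usual[0].flatten().tolist()[input_len:])        # [9, 8, 7]

nested = torch.tensor([[[101, 102, 103, 9, 8, 7]]])   # hypothetical extra leading dim
# Without flatten(), nested[0].tolist() is [[101, ..., 7]] and the slice
# [input_len:] would drop whole rows instead of the prompt tokens.
print(nested[0].flatten().tolist()[input_len:])        # [9, 8, 7]
```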
25 changes: 19 additions & 6 deletions inference/data_server.py
@@ -3,6 +3,7 @@
import argparse
import random
import ujson as json
import base64
import asyncio
import time
import importlib
@@ -820,13 +821,25 @@ async def write_result_async(i_data, i_sample):

def write_result(i_data):
"""将推理结果写入jsonl文件"""
def default_serializer(obj):
return f"Lost data of JSON-unserializable type {obj.__class__.__name__}"
def dumps(r):
try:
json_kwargs = dict(
ensure_ascii=False,
escape_forward_slashes=False # must be set for ujson to match the standard-library json behavior
)
return json.dumps(r, **json_kwargs)
except Exception as e:
import json # fall back to the standard-library json for records ujson cannot serialize
json_kwargs = dict(
ensure_ascii=False,
default=default_serializer # handle values that are not JSON-serializable (e.g. bytes)
)
return json.dumps(r, **json_kwargs)
try:
d = file_list[i_data]
# print('save:', d.output_path)
json_kwargs = dict(
ensure_ascii=False,
escape_forward_slashes=False, # must be set for ujson to match the standard-library json behavior
)
os.makedirs(os.path.split(d.output_path)[0], exist_ok=True)
if d.n_write == -1:
d.n_write = 0
@@ -837,15 +850,15 @@ def write_result(i_data):
if args.no_order:
while d.results_new:
r = d.results_new.popleft()
f.write(json.dumps(r, **json_kwargs) + '\n')
f.write(dumps(r) + '\n')
d.n_write += 1
else:
while d.n_write < len(d.results):
i = d.n_write
r = d.results[i]
if r is None:
break
f.write(json.dumps(r, **json_kwargs) + '\n')
f.write(dumps(r) + '\n')
d.results[i] = None # free memory
d.n_write += 1

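A standalone sketch of the serialization fallback introduced in `write_result`: ujson is tried first (fast, but without a `default=` hook in this code path), and the standard-library `json` takes over for records containing values ujson cannot serialize, such as `bytes`. Names mirror the diff; the sample records are invented:

```python
import ujson  # assumed available, as in data_server.py

def default_serializer(obj):
    # Replace unserializable values instead of losing the whole record.
    return f"Lost data of JSON-unserializable type {obj.__class__.__name__}"

def dumps(record):
    try:
        # ujson-specific option keeps forward slashes unescaped, matching stdlib json.
        return ujson.dumps(record, ensure_ascii=False, escape_forward_slashes=False)
    except Exception:
        import json  # stdlib json supports the default= hook for unsupported types
        return json.dumps(record, ensure_ascii=False, default=default_serializer)

print(dumps({"text": "ok"}))                   # served by ujson
print(dumps({"audio": b"\x00\x01", "id": 1}))  # bytes trigger the stdlib fallback
```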
3 changes: 2 additions & 1 deletion requirements.txt
@@ -22,4 +22,5 @@ zmq
pybase64
sanic
rouge_chinese
jieba
jieba
librosa