
Commit f53056d

more compact operator formatting

1 parent 14f2846 · commit f53056d

29 files changed: +248 -255 lines

.style.yapf (+6 -1)
@@ -11,4 +11,9 @@ indent_dictionary_value = True
 allow_multiline_dictionary_keys = True
 each_dict_entry_on_separate_line = False
 allow_multiline_lambdas = True
-blank_line_before_nested_class_or_def = False
+blank_line_before_nested_class_or_def = False
+arithmetic_precedence_indication = True
+no_spaces_around_selected_binary_operators = "*,/"
+coalesce_brackets = True
+space_between_ending_comma_and_closing_bracket = False
+split_before_expression_after_opening_paren = False

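For orientation, here is a minimal before/after sketch of what the new knobs do (the snippet itself is hypothetical; the option names are exactly those added above). no_spaces_around_selected_binary_operators = "*,/" drops the spaces around * and /, arithmetic_precedence_indication = True keeps spaces around lower-precedence operators such as + so precedence stays visible, and space_between_ending_comma_and_closing_bracket = False turns (1, ) into (1,):

# Hypothetical snippet, as the old style formatted it:
speed = int(downloaded_bytes / elapsed_time)
offset = n_layers // 2 + 1
shape = (1, )

# After `yapf -i` with the settings added in this commit:
speed = int(downloaded_bytes/elapsed_time)
offset = n_layers//2 + 1
shape = (1,)
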
exo/api/chatgpt_api.py (+3 -3)
@@ -158,7 +158,7 @@ def __init__(self, node: Node, inference_engine_classname: str, response_timeout
     self.inference_engine_classname = inference_engine_classname
     self.response_timeout_secs = response_timeout_secs
     self.on_chat_completion_request = on_chat_completion_request
-    self.app = web.Application(client_max_size=100 * 1024 * 1024) # 100MB to support image upload
+    self.app = web.Application(client_max_size=100*1024*1024) # 100MB to support image upload
     self.prompts: PrefixDict[str, PromptSession] = PrefixDict()
     self.prev_token_lens: Dict[str, int] = {}
     self.stream_tasks: Dict[str, asyncio.Task] = {}
@@ -171,7 +171,7 @@ def __init__(self, node: Node, inference_engine_classname: str, response_timeout
     )
     cors.add(self.app.router.add_post("/v1/chat/completions", self.handle_post_chat_completions), {"*": cors_options})
     cors.add(self.app.router.add_post("/v1/chat/token/encode", self.handle_post_chat_token_encode), {"*": cors_options})
-    self.static_dir = Path(__file__).parent.parent.parent / "tinychat/examples/tinychat"
+    self.static_dir = Path(__file__).parent.parent.parent/"tinychat/examples/tinychat"
     self.app.router.add_get("/", self.handle_root)
     self.app.router.add_static("/", self.static_dir, name="static")
@@ -186,7 +186,7 @@ async def middleware(request):
     return middleware

   async def handle_root(self, request):
-    return web.FileResponse(self.static_dir / "index.html")
+    return web.FileResponse(self.static_dir/"index.html")

   async def handle_post_chat_token_encode(self, request):
     data = await request.json()

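Aside: client_max_size here is aiohttp's cap on the request body size, and the touched expression is plain arithmetic, so its value is unchanged by the respacing. A minimal standalone sketch of the same call (outside exo):

from aiohttp import web

# 100*1024*1024 == 104857600 bytes (100 MiB), with or without spaces around "*"
app = web.Application(client_max_size=100*1024*1024)
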
exo/download/hf/hf_helpers.py (+21 -21)
@@ -62,12 +62,12 @@ def _add_wildcard_to_directories(pattern: str) -> str:

 def get_hf_home() -> Path:
   """Get the Hugging Face home directory."""
-  return Path(os.environ.get("HF_HOME", Path.home() / ".cache" / "huggingface"))
+  return Path(os.environ.get("HF_HOME", Path.home()/".cache"/"huggingface"))


 async def get_hf_token():
   """Retrieve the Hugging Face token from the user's HF_HOME directory."""
-  token_path = get_hf_home() / "token"
+  token_path = get_hf_home()/"token"
   if await aios.path.exists(token_path):
     async with aiofiles.open(token_path, 'r') as f:
       return (await f.read()).strip()
@@ -85,7 +85,7 @@ async def get_auth_headers():
 def get_repo_root(repo_id: str) -> Path:
   """Get the root directory for a given repo ID in the Hugging Face cache."""
   sanitized_repo_id = repo_id.replace("/", "--")
-  return get_hf_home() / "hub" / f"models--{sanitized_repo_id}"
+  return get_hf_home()/"hub"/f"models--{sanitized_repo_id}"


 async def fetch_file_list(session, repo_id, revision, path=""):
@@ -181,9 +181,9 @@ async def download_file(
         downloaded_this_session += len(chunk)
         if progress_callback and total_size:
           elapsed_time = (datetime.now() - start_time).total_seconds()
-          speed = int(downloaded_this_session / elapsed_time) if elapsed_time > 0 else 0
+          speed = int(downloaded_this_session/elapsed_time) if elapsed_time > 0 else 0
           remaining_size = total_size - downloaded_size
-          eta = timedelta(seconds=remaining_size / speed) if speed > 0 else timedelta(0)
+          eta = timedelta(seconds=remaining_size/speed) if speed > 0 else timedelta(0)
           status = "in_progress" if downloaded_size < total_size else "complete"
           if DEBUG >= 8: print(f"HF repo file download progress: {file_path=} {elapsed_time=} {speed=} Downloaded={downloaded_size}/{total_size} {remaining_size=} {eta=} {status=}")
           await progress_callback(RepoFileProgressEvent(repo_id, revision, file_path, downloaded_size, downloaded_this_session, total_size, speed, eta, status))
@@ -199,17 +199,17 @@ async def download_repo_files(
   max_parallel_downloads: int = 4
 ) -> Path:
   repo_root = get_repo_root(repo_id)
-  refs_dir = repo_root / "refs"
-  snapshots_dir = repo_root / "snapshots"
-  cachedreqs_dir = repo_root / "cachedreqs"
+  refs_dir = repo_root/"refs"
+  snapshots_dir = repo_root/"snapshots"
+  cachedreqs_dir = repo_root/"cachedreqs"

   # Ensure directories exist
   await aios.makedirs(refs_dir, exist_ok=True)
   await aios.makedirs(snapshots_dir, exist_ok=True)
   await aios.makedirs(cachedreqs_dir, exist_ok=True)

   # Check if we have a cached commit hash
-  refs_file = refs_dir / revision
+  refs_file = refs_dir/revision
   if await aios.path.exists(refs_file):
     async with aiofiles.open(refs_file, 'r') as f:
       commit_hash = (await f.read()).strip()
@@ -230,13 +230,13 @@ async def download_repo_files(
       await f.write(commit_hash)

   # Set up the snapshot directory
-  snapshot_dir = snapshots_dir / commit_hash
+  snapshot_dir = snapshots_dir/commit_hash
   await aios.makedirs(snapshot_dir, exist_ok=True)

   # Set up the cached file list directory
-  cached_file_list_dir = cachedreqs_dir / commit_hash
+  cached_file_list_dir = cachedreqs_dir/commit_hash
   await aios.makedirs(cached_file_list_dir, exist_ok=True)
-  cached_file_list_path = cached_file_list_dir / "fetch_file_list.json"
+  cached_file_list_path = cached_file_list_dir/"fetch_file_list.json"

   async with aiohttp.ClientSession() as session:
     # Check if we have a cached file list
@@ -261,17 +261,17 @@ async def download_repo_files(
     start_time = datetime.now()

     async def download_with_progress(file_info, progress_state):
-      local_path = snapshot_dir / file_info["path"]
+      local_path = snapshot_dir/file_info["path"]
       if await aios.path.exists(local_path) and (await aios.stat(local_path)).st_size == file_info["size"]:
         if DEBUG >= 2: print(f"File already fully downloaded: {file_info['path']}")
         progress_state['completed_files'] += 1
         progress_state['downloaded_bytes'] += file_info["size"]
         file_progress[file_info["path"]] = RepoFileProgressEvent(repo_id, revision, file_info["path"], file_info["size"], 0, file_info["size"], 0, timedelta(0), "complete")
         if progress_callback:
           elapsed_time = (datetime.now() - start_time).total_seconds()
-          overall_speed = int(progress_state['downloaded_bytes_this_session'] / elapsed_time) if elapsed_time > 0 else 0
+          overall_speed = int(progress_state['downloaded_bytes_this_session']/elapsed_time) if elapsed_time > 0 else 0
           remaining_bytes = total_bytes - progress_state['downloaded_bytes']
-          overall_eta = timedelta(seconds=remaining_bytes / overall_speed) if overall_speed > 0 else timedelta(seconds=0)
+          overall_eta = timedelta(seconds=remaining_bytes/overall_speed) if overall_speed > 0 else timedelta(seconds=0)
           status = "in_progress" if progress_state['completed_files'] < total_files else "complete"
           await progress_callback(
             RepoProgressEvent(
@@ -287,9 +287,9 @@ async def file_progress_callback(event: RepoFileProgressEvent):
     file_progress[event.file_path] = event
     if progress_callback:
       elapsed_time = (datetime.now() - start_time).total_seconds()
-      overall_speed = int(progress_state['downloaded_bytes_this_session'] / elapsed_time) if elapsed_time > 0 else 0
+      overall_speed = int(progress_state['downloaded_bytes_this_session']/elapsed_time) if elapsed_time > 0 else 0
       remaining_bytes = total_bytes - progress_state['downloaded_bytes']
-      overall_eta = timedelta(seconds=remaining_bytes / overall_speed) if overall_speed > 0 else timedelta(seconds=0)
+      overall_eta = timedelta(seconds=remaining_bytes/overall_speed) if overall_speed > 0 else timedelta(seconds=0)
       status = "in_progress" if progress_state['downloaded_bytes'] < total_bytes else "complete"
       await progress_callback(
         RepoProgressEvent(
@@ -305,9 +305,9 @@ async def file_progress_callback(event: RepoFileProgressEvent):
       ] = RepoFileProgressEvent(repo_id, revision, file_info["path"], file_info["size"], file_progress[file_info["path"]].downloaded_this_session, file_info["size"], 0, timedelta(0), "complete")
       if progress_callback:
         elapsed_time = (datetime.now() - start_time).total_seconds()
-        overall_speed = int(progress_state['downloaded_bytes_this_session'] / elapsed_time) if elapsed_time > 0 else 0
+        overall_speed = int(progress_state['downloaded_bytes_this_session']/elapsed_time) if elapsed_time > 0 else 0
         remaining_bytes = total_bytes - progress_state['downloaded_bytes']
-        overall_eta = timedelta(seconds=remaining_bytes / overall_speed) if overall_speed > 0 else timedelta(seconds=0)
+        overall_eta = timedelta(seconds=remaining_bytes/overall_speed) if overall_speed > 0 else timedelta(seconds=0)
         status = "in_progress" if progress_state['completed_files'] < total_files else "complete"
         await progress_callback(
           RepoProgressEvent(
@@ -347,11 +347,11 @@ async def get_weight_map(repo_id: str, revision: str = "main") -> Optional[Dict[

   # Check if the file exists
   repo_root = get_repo_root(repo_id)
-  snapshot_dir = repo_root / "snapshots"
+  snapshot_dir = repo_root/"snapshots"
   index_file = next((f for f in await aios.listdir(snapshot_dir) if f.endswith("model.safetensors.index.json")), None)

   if index_file:
-    index_file_path = snapshot_dir / index_file
+    index_file_path = snapshot_dir/index_file
     if await aios.path.exists(index_file_path):
       async with aiofiles.open(index_file_path, 'r') as f:
         index_data = json.loads(await f.read())

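Note that every "/" touched in this file is pathlib's join operator (Path.__truediv__), not division, so dropping the surrounding spaces is purely cosmetic. A quick standalone check (illustrative paths, not from the repo):

from pathlib import Path

# Both spellings call Path.__truediv__ and build the identical path object
assert Path.home() / ".cache" / "huggingface" == Path.home()/".cache"/"huggingface"
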
exo/download/hf/hf_shard_download.py (+1 -1)
@@ -22,7 +22,7 @@ async def ensure_shard(self, shard: Shard) -> Path:
       return self.completed_downloads[shard]
     if self.quick_check:
       repo_root = get_repo_root(shard.model_id)
-      snapshots_dir = repo_root / "snapshots"
+      snapshots_dir = repo_root/"snapshots"
       if snapshots_dir.exists():
         most_recent_dir = max(snapshots_dir.iterdir(), key=lambda x: x.stat().st_mtime)
         return most_recent_dir

exo/helpers.py (+1 -1)
@@ -169,7 +169,7 @@ def is_valid_uuid(val):


 def get_or_create_node_id():
-  NODE_ID_FILE = Path(os.path.dirname(os.path.abspath(__file__))) / ".exo_node_id"
+  NODE_ID_FILE = Path(os.path.dirname(os.path.abspath(__file__)))/".exo_node_id"
   try:
     if NODE_ID_FILE.is_file():
       with open(NODE_ID_FILE, "r") as f:

exo/inference/debug_inference_engine.py (+1 -1)
@@ -10,7 +10,7 @@ async def test_inference_engine(inference_engine_1: InferenceEngine, inference_e
   from exo.inference.tinygrad.inference import Tokenizer
   from pathlib import Path

-  _tokenizer = Tokenizer(str(Path(model_id) / "tokenizer.model"))
+  _tokenizer = Tokenizer(str(Path(model_id)/"tokenizer.model"))

   prompt = "In a single word only, what is the last name of the president of the United States? "
   resp_full, inference_state_full, _ = await inference_engine_1.infer_prompt("A", shard=Shard(model_id=model_id, start_layer=0, end_layer=31, n_layers=32), prompt=prompt)

exo/inference/mlx/models/deepseek_v2.py (+1 -1)
@@ -59,7 +59,7 @@ def __call__(
       mask = mask.astype(h.dtype)

     if cache is None:
-      cache = [None] * len(self.layers)
+      cache = [None]*len(self.layers)

     for layer, c in zip(self.layers, cache):
       h = layer(h, mask, c)

exo/inference/mlx/models/llama.py (+1 -1)
@@ -58,7 +58,7 @@ def __call__(
       mask = create_attention_mask(h, cache)

     if cache is None:
-      cache = [None] * len(self.layers)
+      cache = [None]*len(self.layers)

     for layer, c in zip(self.layers, cache):
       h = layer(h, mask, cache=c)

exo/inference/mlx/models/llava.py (+12 -12)
@@ -74,8 +74,8 @@ def __call__(self, queries, keys, values, mask=None):
     keys = keys.reshape(B, S, num_heads, -1).transpose(0, 2, 3, 1)
     values = values.reshape(B, S, num_heads, -1).transpose(0, 2, 1, 3)

-    scale = math.sqrt(1 / queries.shape[-1])
-    scores = (queries * scale) @ keys
+    scale = math.sqrt(1/queries.shape[-1])
+    scores = (queries*scale) @ keys
     if mask is not None:
       scores = scores + mask.astype(scores.dtype)
     scores = mx.softmax(scores, axis=-1)
@@ -129,7 +129,7 @@ def __init__(self, config: VisionConfig):
     self.image_size = config.image_size
     self.patch_size = config.patch_size

-    self.class_embedding = mx.zeros((config.hidden_size, ))
+    self.class_embedding = mx.zeros((config.hidden_size,))

     self.patch_embedding = nn.Conv2d(
       in_channels=config.num_channels,
@@ -170,12 +170,12 @@ def __call__(
     x = self.embeddings(x)
     x = self.pre_layrnorm(x)

-    encoder_states = (x, ) if output_hidden_states else None
+    encoder_states = (x,) if output_hidden_states else None

     for l in self.encoder.layers:
       x = l(x, mask=None)
       if output_hidden_states:
-        encoder_states = encoder_states + (x, )
+        encoder_states = encoder_states + (x,)

     pooler_output = self.post_layernorm(x[:, 0, :])
     return pooler_output, x, encoder_states
@@ -263,12 +263,12 @@ def __init__(self, config: TextConfig):
     head_dim = config.hidden_size // n_heads
     self.scale = head_dim**-0.5

-    self.q_proj = nn.Linear(dim, n_heads * head_dim, bias=False)
-    self.k_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=False)
-    self.v_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=False)
-    self.o_proj = nn.Linear(n_heads * head_dim, dim, bias=False)
+    self.q_proj = nn.Linear(dim, n_heads*head_dim, bias=False)
+    self.k_proj = nn.Linear(dim, n_kv_heads*head_dim, bias=False)
+    self.v_proj = nn.Linear(dim, n_kv_heads*head_dim, bias=False)
+    self.o_proj = nn.Linear(n_heads*head_dim, dim, bias=False)

-    rope_scale = (1 / config.rope_scaling["factor"] if config.rope_scaling is not None and config.rope_scaling["type"] == "linear" else 1)
+    rope_scale = (1/config.rope_scaling["factor"] if config.rope_scaling is not None and config.rope_scaling["type"] == "linear" else 1)
     self.rope = nn.RoPE(
       head_dim,
       traditional=config.rope_traditional,
@@ -312,7 +312,7 @@ def __init__(self, dim, hidden_dim):
     self.up_proj = nn.Linear(dim, hidden_dim, bias=False)

   def __call__(self, x) -> mx.array:
-    return self.down_proj(nn.silu(self.gate_proj(x)) * self.up_proj(x))
+    return self.down_proj(nn.silu(self.gate_proj(x))*self.up_proj(x))


 class TransformerBlock(nn.Module):
@@ -382,7 +382,7 @@ def __call__(
       mask = mask.astype(h.dtype)

     if cache is None:
-      cache = [None] * len(self.layers)
+      cache = [None]*len(self.layers)

     for layer, c in zip(self.layers, cache):
       h = layer(h, mask, c)

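A note on the attention hunk above: scale = math.sqrt(1/queries.shape[-1]) is the usual 1/sqrt(head_dim) attention scaling, and multiplying queries by it before the matmul is equivalent to dividing the scores afterwards. A tiny standalone check (head_dim chosen arbitrarily):

import math

head_dim = 64
# sqrt(1/d) == 1/sqrt(d), so (q*scale) @ k matches (q @ k)/sqrt(d)
assert abs(math.sqrt(1/head_dim) - 1/math.sqrt(head_dim)) < 1e-12
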
exo/inference/mlx/sharded_model.py (+2 -2)
@@ -38,7 +38,7 @@ def sample(logits: mx.array) -> Tuple[mx.array, float]:
     if top_p > 0 and top_p < 1.0:
       token = top_p_sampling(logits, top_p, temp)
     else:
-      token = mx.random.categorical(logits * (1 / temp))
+      token = mx.random.categorical(logits*(1/temp))

     return token

@@ -74,7 +74,7 @@ def __call__(
     return self.step(request_id, x, temp=temp, top_p=top_p, logit_bias=logit_bias)

   def init_cache(self, request_id: str):
-    kv_heads = ([self.model.n_kv_heads] * len(self.model.layers) if isinstance(self.model.n_kv_heads, int) else self.model.n_kv_heads)
+    kv_heads = ([self.model.n_kv_heads]*len(self.model.layers) if isinstance(self.model.n_kv_heads, int) else self.model.n_kv_heads)
     if self.max_kv_size is not None:
       cache = [RotatingKVCache(self.model.head_dim, n, max_size=self.max_kv_size, keep=4) for n in kv_heads]
     else:

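The sample() hunk above is plain temperature sampling: multiplying logits by 1/temp sharpens the distribution for temp < 1 and flattens it for temp > 1 before a token is drawn. A minimal standalone sketch (logit values made up):

import mlx.core as mx

logits = mx.array([2.0, 1.0, 0.1])
temp = 0.7
# logits*(1/temp) rescales the distribution; as temp -> 0 this approaches argmax
token = mx.random.categorical(logits*(1/temp))
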
exo/inference/mlx/sharded_utils.py (+3 -3)
@@ -60,7 +60,7 @@ def _get_classes(config: dict):


 def load_config(model_path: Path) -> dict:
   try:
-    with open(model_path / "config.json", "r") as f:
+    with open(model_path/"config.json", "r") as f:
       config = json.load(f)
   except FileNotFoundError:
     logging.error(f"Config file not found in {model_path}")
@@ -103,11 +103,11 @@ def load_model_shard(
     "n_layers": shard.n_layers,
   }

-  weight_files = glob.glob(str(model_path / "model*.safetensors"))
+  weight_files = glob.glob(str(model_path/"model*.safetensors"))

   if not weight_files:
     # Try weight for back-compat
-    weight_files = glob.glob(str(model_path / "weight*.safetensors"))
+    weight_files = glob.glob(str(model_path/"weight*.safetensors"))

   if not weight_files:
     logging.error(f"No safetensors found in {model_path}")

exo/inference/mlx/test_sharded_model.py (+1 -1)
@@ -38,7 +38,7 @@ def __call__(self, x, cache=None):
 n_layers = 5
 shard1 = Shard("test", 0, n_layers // 2, n_layers)
 sharded_model1 = DummyModel(shard1)
-shard2 = Shard("test", n_layers // 2 + 1, n_layers - 1, n_layers)
+shard2 = Shard("test", n_layers//2 + 1, n_layers - 1, n_layers)
 sharded_model2 = DummyModel(shard2)

 model.load_weights("./test_weights.npz")

exo/inference/tinygrad/inference.py (+5 -5)
@@ -33,9 +33,9 @@ def build_transformer(model_path: Path, shard: Shard, model_size="8B", device=No

   # load weights
   if model_path.is_dir():
-    if (model_path / "model.safetensors.index.json").exists(): weights = load(str(model_path / "model.safetensors.index.json"), shard)
-    elif (model_path / "model.safetensors").exists(): weights = load(str(model_path / "model.safetensors"), shard)
-    else: weights = concat_weights([load(str(model_path / f"consolidated.{i:02d}.pth"), shard) for i in range(MODEL_PARAMS[model_size]["files"])], device[0] if isinstance(device, tuple) else device)
+    if (model_path/"model.safetensors.index.json").exists(): weights = load(str(model_path/"model.safetensors.index.json"), shard)
+    elif (model_path/"model.safetensors").exists(): weights = load(str(model_path/"model.safetensors"), shard)
+    else: weights = concat_weights([load(str(model_path/f"consolidated.{i:02d}.pth"), shard) for i in range(MODEL_PARAMS[model_size]["files"])], device[0] if isinstance(device, tuple) else device)
   else:
     weights = load(str(model_path), shard)
   weights = convert_from_huggingface(weights, model, MODEL_PARAMS[model_size]["args"]["n_heads"], MODEL_PARAMS[model_size]["args"]["n_kv_heads"])
@@ -60,7 +60,7 @@ async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_s
     toks = self.tokenizer.encode(prompt)
     h = self.model(Tensor([toks]), start_pos, TEMPERATURE).realize()

-    if h.shape == (1, ):
+    if h.shape == (1,):
       start_pos += len(toks)
       start_pos += 1
       n_captured_toks = 0
@@ -76,7 +76,7 @@ async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarr

     h = self.model(Tensor(input_data), start_pos, TEMPERATURE).realize()

-    if h.shape == (1, ):
+    if h.shape == (1,):
       start_pos += n_captured_toks
       start_pos += 1
       n_captured_toks = 0
