Skip to content

Commit f3a6995

Browse files
committed
Update monitor config setup
1 parent 9a0e25d commit f3a6995

File tree

3 files changed

+10
-19
lines changed

3 files changed

+10
-19
lines changed

configs/150M/3090.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
name_model = "150M"
22
project = "debug_150m_zero_band"
3-
metric_logger_type = "dummy"
43

54
[train]
65
micro_bs = 16 # change this based on the GPU

src/zeroband/train.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,6 @@ class TrainConfig(BaseConfig):
5656

5757

5858
class MonitorConfig(BaseConfig):
59-
enable_monitor: bool = False
6059
log_flush_interval: int = 10
6160
base_url: str | None = None
6261
auth_token: str | None = None
@@ -76,7 +75,7 @@ class Config(BaseConfig):
7675
data: DataConfig = DataConfig()
7776
optim: OptimConfig = OptimConfig()
7877
train: TrainConfig
79-
monitor: MonitorConfig = MonitorConfig()
78+
monitor: MonitorConfig | None = None
8079

8180

8281
def train(config: Config):
@@ -167,8 +166,9 @@ def train(config: Config):
167166
logger_cls = WandbMetricLogger if config.metric_logger_type == "wandb" else DummyMetricLogger
168167
metric_logger = logger_cls(project=config.project, config=config.model_dump(), resume=False)
169168

170-
monitor = HttpMonitor(config=config.model_dump(), resume=False)
171-
monitor.set_stage("init")
169+
if config.monitor is not None:
170+
monitor = HttpMonitor(config=config.model_dump(), resume=False)
171+
monitor.set_stage("init")
172172

173173
train_dataloader_iterator = iter(train_dataloader)
174174

@@ -182,7 +182,7 @@ def train(config: Config):
182182
# if we don't use diloco we don't print the outer step logs
183183
logger.info(f"outer_step step: {outer_step}")
184184

185-
if world_info.rank == 0:
185+
if world_info.rank == 0 and config.monitor is not None:
186186
monitor.set_stage("inner_loop")
187187

188188
for inner_step in range(num_inner_steps):
@@ -245,12 +245,13 @@ def train(config: Config):
245245

246246
if world_info.rank == 0:
247247
metric_logger.log(metrics)
248-
monitor.log(metrics)
248+
if config.monitor is not None:
249+
monitor.log(metrics)
249250

250251
logger.info(log)
251252

252253
if config.diloco is not None:
253-
if world_info.rank == 0:
254+
if world_info.rank == 0 and config.monitor is not None:
254255
monitor.set_stage("outer_loop")
255256
diloco.step(model)
256257

@@ -263,8 +264,9 @@ def train(config: Config):
263264
break
264265

265266
if world_info.rank == 0:
266-
monitor.finish()
267267
metric_logger.finish()
268+
if config.monitor is not None:
269+
monitor.finish()
268270

269271

270272
if __name__ == "__main__":

src/zeroband/utils/monitor.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ class HttpMonitor:
2525

2626
def __init__(self, config, *args, **kwargs):
2727
self.data = []
28-
self.enabled = config["monitor"]["enable_monitor"]
2928
self.log_flush_interval = config["monitor"]["log_flush_interval"]
3029
self.base_url = config["monitor"]["base_url"]
3130
self.auth_token = config["monitor"]["auth_token"]
@@ -50,19 +49,13 @@ def _remove_duplicates(self):
5049
self.data = unique_logs
5150

5251
def set_stage(self, stage: str):
53-
if not self.enabled:
54-
return
55-
5652
import time
5753

5854
# add a new log entry with the stage name
5955
self.data.append({"stage": stage, "time": time.time()})
6056
self._handle_send_batch(flush=True) # it's useful to have the most up-to-date stage broadcasted
6157

6258
def log(self, data: dict[str, Any]):
63-
if not self.enabled:
64-
return
65-
6659
# Lowercase the keys in the data dictionary
6760
lowercased_data = {k.lower(): v for k, v in data.items()}
6861
self.data.append(lowercased_data)
@@ -131,9 +124,6 @@ def _finish(self):
131124
return False
132125

133126
def finish(self):
134-
if not self.enabled:
135-
return
136-
137127
self.set_stage("finishing")
138128

139129
# Send any remaining logs

0 commit comments

Comments
 (0)