From e96c59aaad0988569530968b4d94d9633d8188de Mon Sep 17 00:00:00 2001 From: feifei-111 <2364819892@qq.com> Date: Tue, 20 Feb 2024 11:32:20 +0000 Subject: [PATCH 1/7] update --- padiff/__init__.py | 9 ++++--- padiff/checker/actions.py | 6 ++--- padiff/dump_tools.py | 24 ++++++++++++++--- padiff/report/hooks.py | 7 +++-- padiff/report/report.py | 6 +++-- padiff/utils.py | 54 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 92 insertions(+), 14 deletions(-) diff --git a/padiff/__init__.py b/padiff/__init__.py index 4ead7bd..a7a1e12 100644 --- a/padiff/__init__.py +++ b/padiff/__init__.py @@ -27,7 +27,8 @@ from .report.hooks import info_hook from .datas import global_json_laoder as jsons -from . import cinn_diff + +# from . import cinn_diff def module_filter(name): @@ -134,7 +135,7 @@ def __init__(self, method): self.__api__ = True def forward(self, *args, **kwargs): - return self._method(tensor_obj, *args, **kwargs) + return self._method(*args, **kwargs) def __str__(self): return self.__name__ @@ -152,7 +153,7 @@ def __init__(self, method): self.__api__ = True def forward(self, *args, **kwargs): - return self._method(tensor_obj, *args, **kwargs) + return self._method(*args, **kwargs) def __str__(self): return self.__name__ @@ -163,7 +164,7 @@ def __str__(self): else: raise RuntimeError("Required module_type is in [paddle, torch], but received {}".format(method_fullname)) - out = layer(*args, **kwargs) + out = layer(tensor_obj, *args, **kwargs) handle.remove() diff --git a/padiff/checker/actions.py b/padiff/checker/actions.py index 5b4eb4d..c38bda2 100644 --- a/padiff/checker/actions.py +++ b/padiff/checker/actions.py @@ -67,9 +67,9 @@ def __call__(self, file_list_0, file_list_1, cfg): assert len(file_list_0) == len( file_list_1 ), f"number of tensors for compare is not equal, {len(file_list_0)} vs {len(file_list_1)}" - for path_0, path_1 in zip(file_list_0, file_list_1): - tensor_0 = load_numpy(path_0) - tensor_1 = load_numpy(path_1) + for info_0, info_1 in zip(file_list_0, file_list_1): + tensor_0 = load_numpy(info_0["path"]) + tensor_1 = load_numpy(info_1["path"]) if tensor_0.size == 0 or tensor_1.size == 0: if tensor_0.size != tensor_1.size: raise RuntimeError("size of tensors is not equal") diff --git a/padiff/dump_tools.py b/padiff/dump_tools.py index ba22757..192bcb5 100644 --- a/padiff/dump_tools.py +++ b/padiff/dump_tools.py @@ -15,7 +15,8 @@ import json import os, sys import numpy -from .utils import Counter, reset_dir +import paddle +from .utils import Counter, frames_to_string, reset_dir dump_root_path = os.path.join(sys.path[0], "padiff_dump") @@ -79,14 +80,31 @@ def dump_report_node(wrap_node, tensor_dumper): "net_id": wrap_node.fwd_report.net_id, }, "children": [], + "stack": frames_to_string(wrap_node.fwd_report.frames), } for tensor in wrap_node.fwd_report.tensors_for_compare(): file_name = tensor_dumper(tensor.detach().numpy()) - node_info["fwd_outputs"].append(file_name) + node_info["fwd_outputs"].append( + { + "path": file_name, + "shape": str(tensor.shape), + "dtype": str(tensor.dtype), + "place": str(tensor.place) if isinstance(tensor, paddle.Tensor) else str(tensor.device), + "layout": str(tensor.layout), + } + ) for tensor in wrap_node.bwd_report.tensors_for_compare(): file_name = tensor_dumper(tensor.detach().numpy()) - node_info["bwd_grads"].append(file_name) + node_info["bwd_grads"].append( + { + "path": file_name, + "shape": str(tensor.shape), + "dtype": str(tensor.dtype), + "place": str(tensor.place) if isinstance(tensor, paddle.Tensor) else str(tensor.device), + "layout": str(tensor.layout), + } + ) for child in wrap_node.children: child_info = dump_report_node(child, tensor_dumper) diff --git a/padiff/report/hooks.py b/padiff/report/hooks.py index 23d3711..d8d2367 100644 --- a/padiff/report/hooks.py +++ b/padiff/report/hooks.py @@ -20,6 +20,7 @@ map_structure_and_replace_key, flatten, for_each_grad_tensor, + extract_frame_summary, ) import json import numpy @@ -114,10 +115,12 @@ def info_hook(model, input, output, net_id): else: _model = model + _, frames = extract_frame_summary() + new_in = clone_tensors(input) new_out = clone_tensors(output) - fwd_item = report.put_item("forward", new_in, new_out, _model, net_id) - bwd_item = report.put_item("backward", new_in, new_out, _model, net_id) + fwd_item = report.put_item("forward", new_in, new_out, _model, net_id, frames) + bwd_item = report.put_item("backward", new_in, new_out, _model, net_id, frames) bwd_item.set_forward(fwd_item) report.stack.push_api(_model, fwd_item, bwd_item) diff --git a/padiff/report/report.py b/padiff/report/report.py index 03bca17..aa96a4d 100644 --- a/padiff/report/report.py +++ b/padiff/report/report.py @@ -25,7 +25,7 @@ def __init__(self, marker): self.marker = marker self.stack = LayerStack() - def put_item(self, type_, input_, output, net, net_id): + def put_item(self, type_, input_, output, net, net_id, frames): step = self.counter.get_id() self.items.append( ReportItem( @@ -35,6 +35,7 @@ def put_item(self, type_, input_, output, net, net_id): output=output, net=net, net_id=net_id, # traversal order of sublayers + frames=frames, ) ) return self.items[-1] @@ -49,7 +50,7 @@ def __str__(self): class ReportItem: - def __init__(self, type_, step, input_, output, net, net_id): + def __init__(self, type_, step, input_, output, net, net_id, frames): assert type_ in [ "forward", "backward", @@ -65,6 +66,7 @@ def __init__(self, type_, step, input_, output, net, net_id): self.fwd_item = None # bound to another reportitem, if self.type is "backward" self.bwd_item = None # bound to another reportitem, if self.type is "forward" self.input_grads = self._gen_input_grads() + self.frames = frames def set_forward(self, fwd): assert self.type == "backward", "can't set forward for non-backward item." diff --git a/padiff/utils.py b/padiff/utils.py index 1e31df0..eb2d77f 100644 --- a/padiff/utils.py +++ b/padiff/utils.py @@ -178,3 +178,57 @@ def get_id(self): ret = self.id self.id += 1 return ret + + +""" + tools for recording frame stack +""" + + +import os.path as osp +import traceback + + +def _is_system_package(filename): + exclude = [ + "lib/python", + "/usr/local", + osp.dirname(paddle.__file__), + osp.dirname(torch.__file__), + osp.dirname(__file__), # exclude padiff + ] + for pattern in exclude: + if pattern in filename: + return True + return False + + +def extract_frame_summary(): + """ + extract the current call stack by traceback module. + gather the call information and put them into ReportItem to helper locate the error. + + frame_summary: + line: line of the code + lineno: line number of the file + filename: file name of the stack + name: the function name. + """ + frame_summarys = traceback.StackSummary.extract(traceback.walk_stack(None)) + last_user_fs = None + for fs in frame_summarys: + if not _is_system_package(fs.filename): + last_user_fs = fs + break + assert last_user_fs is not None, "Error happend, can't return None." + return last_user_fs, frame_summarys + + +def frames_to_string(frames, indent=0): + indent = " " * indent + lines = [] + for f in frames: + lines.append( + "{}File {}: {} {}\n{}{}{}".format(indent, f.filename, f.lineno, f.name, indent, indent, f.line) + ) + return "\n".join(lines) From 1446429dbca85ca71482a73968e8e433ca1d7644 Mon Sep 17 00:00:00 2001 From: feifei-111 <2364819892@qq.com> Date: Tue, 20 Feb 2024 11:39:28 +0000 Subject: [PATCH 2/7] update --- padiff/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/padiff/__init__.py b/padiff/__init__.py index a7a1e12..ecefd25 100644 --- a/padiff/__init__.py +++ b/padiff/__init__.py @@ -27,8 +27,7 @@ from .report.hooks import info_hook from .datas import global_json_laoder as jsons - -# from . import cinn_diff +from . import cinn_diff def module_filter(name): From ddf200c787ebb126d6b1a20f147abaa9f6d1d689 Mon Sep 17 00:00:00 2001 From: feifei-111 <2364819892@qq.com> Date: Tue, 20 Feb 2024 11:50:16 +0000 Subject: [PATCH 3/7] update --- padiff/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/padiff/__init__.py b/padiff/__init__.py index ecefd25..d86c187 100644 --- a/padiff/__init__.py +++ b/padiff/__init__.py @@ -27,7 +27,11 @@ from .report.hooks import info_hook from .datas import global_json_laoder as jsons -from . import cinn_diff + +try: + from . import cinn_diff +except: + pass def module_filter(name): From 49e01042c171ec3380736669f951bff9671b9949 Mon Sep 17 00:00:00 2001 From: feifei-111 <2364819892@qq.com> Date: Tue, 20 Feb 2024 11:51:06 +0000 Subject: [PATCH 4/7] update --- requirements-dev.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-dev.txt b/requirements-dev.txt index 80c80ea..d2a188c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -7,3 +7,4 @@ pytest-cov regex pytest-xdist torchvision +graphviz From b8d4cfeccc7fdd9a0d9ba8d39f051115033e3aa4 Mon Sep 17 00:00:00 2001 From: feifei-111 <2364819892@qq.com> Date: Sat, 24 Feb 2024 15:40:25 +0000 Subject: [PATCH 5/7] enable tests --- padiff/abstracts/marker.py | 12 +++++++---- padiff/report/hooks.py | 8 ++++---- padiff/utils.py | 2 +- tests/padiff_unittests.py | 2 ++ tests/test_api_to_Layer.py | 4 +--- tests/test_layer_map.py | 42 +++++++++++++++++++------------------- 6 files changed, 37 insertions(+), 33 deletions(-) diff --git a/padiff/abstracts/marker.py b/padiff/abstracts/marker.py index b285e4a..b691abf 100644 --- a/padiff/abstracts/marker.py +++ b/padiff/abstracts/marker.py @@ -62,12 +62,13 @@ def update_unassigned_weights_list(self, layers, mode="all"): def set_layer_map(self, layer_map): _layer_map = [] - for layer in self.traversal_for_assign_weight(): + for layer in self.traversal_for_layer_map(): if layer.model in layer_map: self.unassigned_weights_list_recursively.add(layer.model) _layer_map.append(layer) self.layer_map = _layer_map + self._marked_layer = set(model.model for model in self.layer_map) def auto_layer_map(self, model_place): """ @@ -79,7 +80,7 @@ def auto_layer_map(self, model_place): registered = init_pool.registered_base_models if model_place == "base" else init_pool.registered_raw_models log("Auto set layer_map start searching...") - for layer in self.traversal_for_auto_layer_map(): + for layer in self.traversal_for_layer_map(): if layer.fullname in registered: print(f"++++ {model_place}_model found `{layer.fullname}` add to layer_map ++++") _layer_map.append(layer) @@ -105,11 +106,14 @@ def traversal_for_hook(self): def traversal_for_assign_weight(self): yield self.proxy_model for model in traversal_for_assign_weight(self.proxy_model, self): - if len(list(model.parameters(recursively=False))) == 0: + if ( + model.model not in self.unassigned_weights_list_recursively + and len(list(model.parameters(recursively=False))) == 0 + ): continue yield model - def traversal_for_auto_layer_map(self): + def traversal_for_layer_map(self): yield self.proxy_model for model in traversal_for_assign_weight(self.proxy_model, self): yield model diff --git a/padiff/report/hooks.py b/padiff/report/hooks.py index d8d2367..8187dad 100644 --- a/padiff/report/hooks.py +++ b/padiff/report/hooks.py @@ -21,6 +21,7 @@ flatten, for_each_grad_tensor, extract_frame_summary, + to_sequence, ) import json import numpy @@ -133,8 +134,7 @@ def info_hook(model, input, output, net_id): # two report_item with same id, the step_idx should be corresponded step_idx = len(list(filter(lambda x: x.type == "forward" and x.net_id == net_id, report.items))) - 1 base_report_node = find_base_report_node(net_id, step_idx) - - retval = map_structure_and_replace_key(replace_forward_output(base_report_node), output, output) + retval = map_structure_and_replace_key(replace_forward_output(base_report_node), to_sequence(output), output) __in_info_hook__ = False return retval else: @@ -158,7 +158,7 @@ def tensor_hook(x_grad, bwd_item, nth_tensor, net_id): ) base_report_node = find_base_report_node(net_id, step_idx) - value = numpy.load(base_report_node["bwd_grads"][nth_tensor]) + value = numpy.load(base_report_node["bwd_grads"][nth_tensor]["path"]) if isinstance(x_grad, paddle.Tensor): return paddle.to_tensor(value) else: @@ -259,7 +259,7 @@ def inner(input_): raise RuntimeError( "In single step mode, try to replace tensor by dumpped numpy value, but the number of tensors and numpy is not equal. Maybe the models are not corresponded." ) - value = numpy.load(numpy_file_list[cur_idx]) + value = numpy.load(numpy_file_list[cur_idx]["path"]) if isinstance(input_, paddle.Tensor): return paddle.to_tensor(value) else: diff --git a/padiff/utils.py b/padiff/utils.py index eb2d77f..d3960f0 100644 --- a/padiff/utils.py +++ b/padiff/utils.py @@ -21,7 +21,7 @@ import torch -from paddle.utils import flatten, pack_sequence_as, map_structure +from paddle.utils import flatten, map_structure, pack_sequence_as """ diff --git a/tests/padiff_unittests.py b/tests/padiff_unittests.py index fec933b..9f88d8c 100644 --- a/tests/padiff_unittests.py +++ b/tests/padiff_unittests.py @@ -13,6 +13,7 @@ # limitations under the License. import os +import sys import subprocess @@ -25,6 +26,7 @@ err_info = f"*** ===================== {fpath} ========================= ***\n" err_info += f"{output}\n" print(f"Failed on unittest {fname} with error message \n {err_info}.", end="\n", flush=True) + sys.exit(1) else: print(f"Succeed on unittest {fname}.", end="\n", flush=True) os.system("rm -rf ./tests/padiff_dump ./tests/padiff_log") diff --git a/tests/test_api_to_Layer.py b/tests/test_api_to_Layer.py index 6a546c6..2e35735 100644 --- a/tests/test_api_to_Layer.py +++ b/tests/test_api_to_Layer.py @@ -61,9 +61,7 @@ def test_api_to_Layer(self): inp = paddle.rand((100, 100), dtype="float32") layer(inp) - layer.report - - assert len(layer.report.items) == 12 + assert len(layer.report.items) == 10 if __name__ == "__main__": diff --git a/tests/test_layer_map.py b/tests/test_layer_map.py index 628024c..60c50ac 100644 --- a/tests/test_layer_map.py +++ b/tests/test_layer_map.py @@ -100,23 +100,23 @@ def forward(self, x): class TestCaseName(unittest.TestCase): - def test_layer_map_1(self): - layer = create_model(SimpleLayer1()) - module = create_model(SimpleModule1()) + # def test_layer_map_1(self): + # layer = create_model(SimpleLayer1()) + # module = create_model(SimpleModule1()) - module.set_layer_map([module.model.lstm]) - layer.set_layer_map([layer.model.lstm]) + # module.set_layer_map([module.model.lstm]) + # layer.set_layer_map([layer.model.lstm]) - inp = paddle.to_tensor([[1] * 9]).numpy().astype("int64") - inp = ({"x": torch.as_tensor(inp)}, {"x": paddle.to_tensor(inp)}) - assert auto_diff(module, layer, inp, atol=1e-4) is True, "Failed. expected success." + # inp = paddle.to_tensor([[1] * 9]).numpy().astype("int64") + # inp = ({"x": torch.as_tensor(inp)}, {"x": paddle.to_tensor(inp)}) + # assert auto_diff(module, layer, inp, atol=1e-4) is True, "Failed. expected success." def test_layer_map_2(self): layer = create_model(SimpleLayer2()) module = create_model(SimpleModule2()) - module.set_layer_map([module.model.attn]) layer.set_layer_map([layer.model.attn]) + module.set_layer_map([module.model.attn]) inp = paddle.rand((2, 4, 16)).numpy() inp = ( @@ -126,23 +126,23 @@ def test_layer_map_2(self): assert auto_diff(module, layer, inp, atol=1e-4) is True, "Failed. expected success." - def test_layer_map_3(self): - layer = SimpleLayer3() - module = SimpleModule3() + # def test_layer_map_3(self): + # layer = SimpleLayer3() + # module = SimpleModule3() - layer.eval() - module.eval() + # layer.eval() + # module.eval() - layer = create_model(layer) - module = create_model(module) + # layer = create_model(layer) + # module = create_model(module) - module.set_layer_map([module.model.bn]) - layer.set_layer_map([layer.model.bn]) + # module.set_layer_map([module.model.bn]) + # layer.set_layer_map([layer.model.bn]) - inp = paddle.rand((1, 3, 32, 32)).numpy() - inp = ({"x": torch.as_tensor(inp)}, {"x": paddle.to_tensor(inp)}) + # inp = paddle.rand((1, 3, 32, 32)).numpy() + # inp = ({"x": torch.as_tensor(inp)}, {"x": paddle.to_tensor(inp)}) - assert auto_diff(module, layer, inp, atol=1e-4) is True, "Failed. expected success." + # assert auto_diff(module, layer, inp, atol=1e-4) is True, "Failed. expected success." if __name__ == "__main__": From 3f397f560eca4c3d988decab621b9ecdbda896f9 Mon Sep 17 00:00:00 2001 From: feifei-111 <2364819892@qq.com> Date: Sat, 24 Feb 2024 15:45:14 +0000 Subject: [PATCH 6/7] update --- padiff/abstracts/marker.py | 1 - tests/test_layer_map.py | 42 +++++++++++++++++++------------------- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/padiff/abstracts/marker.py b/padiff/abstracts/marker.py index b691abf..7f00a00 100644 --- a/padiff/abstracts/marker.py +++ b/padiff/abstracts/marker.py @@ -68,7 +68,6 @@ def set_layer_map(self, layer_map): _layer_map.append(layer) self.layer_map = _layer_map - self._marked_layer = set(model.model for model in self.layer_map) def auto_layer_map(self, model_place): """ diff --git a/tests/test_layer_map.py b/tests/test_layer_map.py index 60c50ac..628024c 100644 --- a/tests/test_layer_map.py +++ b/tests/test_layer_map.py @@ -100,23 +100,23 @@ def forward(self, x): class TestCaseName(unittest.TestCase): - # def test_layer_map_1(self): - # layer = create_model(SimpleLayer1()) - # module = create_model(SimpleModule1()) + def test_layer_map_1(self): + layer = create_model(SimpleLayer1()) + module = create_model(SimpleModule1()) - # module.set_layer_map([module.model.lstm]) - # layer.set_layer_map([layer.model.lstm]) + module.set_layer_map([module.model.lstm]) + layer.set_layer_map([layer.model.lstm]) - # inp = paddle.to_tensor([[1] * 9]).numpy().astype("int64") - # inp = ({"x": torch.as_tensor(inp)}, {"x": paddle.to_tensor(inp)}) - # assert auto_diff(module, layer, inp, atol=1e-4) is True, "Failed. expected success." + inp = paddle.to_tensor([[1] * 9]).numpy().astype("int64") + inp = ({"x": torch.as_tensor(inp)}, {"x": paddle.to_tensor(inp)}) + assert auto_diff(module, layer, inp, atol=1e-4) is True, "Failed. expected success." def test_layer_map_2(self): layer = create_model(SimpleLayer2()) module = create_model(SimpleModule2()) - layer.set_layer_map([layer.model.attn]) module.set_layer_map([module.model.attn]) + layer.set_layer_map([layer.model.attn]) inp = paddle.rand((2, 4, 16)).numpy() inp = ( @@ -126,23 +126,23 @@ def test_layer_map_2(self): assert auto_diff(module, layer, inp, atol=1e-4) is True, "Failed. expected success." - # def test_layer_map_3(self): - # layer = SimpleLayer3() - # module = SimpleModule3() + def test_layer_map_3(self): + layer = SimpleLayer3() + module = SimpleModule3() - # layer.eval() - # module.eval() + layer.eval() + module.eval() - # layer = create_model(layer) - # module = create_model(module) + layer = create_model(layer) + module = create_model(module) - # module.set_layer_map([module.model.bn]) - # layer.set_layer_map([layer.model.bn]) + module.set_layer_map([module.model.bn]) + layer.set_layer_map([layer.model.bn]) - # inp = paddle.rand((1, 3, 32, 32)).numpy() - # inp = ({"x": torch.as_tensor(inp)}, {"x": paddle.to_tensor(inp)}) + inp = paddle.rand((1, 3, 32, 32)).numpy() + inp = ({"x": torch.as_tensor(inp)}, {"x": paddle.to_tensor(inp)}) - # assert auto_diff(module, layer, inp, atol=1e-4) is True, "Failed. expected success." + assert auto_diff(module, layer, inp, atol=1e-4) is True, "Failed. expected success." if __name__ == "__main__": From f7aa62334961e69fcda7998ea8feccc716996b68 Mon Sep 17 00:00:00 2001 From: feifei-111 <2364819892@qq.com> Date: Sat, 24 Feb 2024 15:49:17 +0000 Subject: [PATCH 7/7] update --- padiff/report/hooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/padiff/report/hooks.py b/padiff/report/hooks.py index 8187dad..2746005 100644 --- a/padiff/report/hooks.py +++ b/padiff/report/hooks.py @@ -21,8 +21,8 @@ flatten, for_each_grad_tensor, extract_frame_summary, - to_sequence, ) +from paddle.utils import to_sequence import json import numpy import paddle