From 65aaf841a63e81e91d12ac956ce848c700646683 Mon Sep 17 00:00:00 2001 From: Alan Ayala Date: Fri, 23 Aug 2024 15:04:38 -0400 Subject: [PATCH] Implemented 1D kernels for factorizable sizes < 1024 --- CHANGELOG.md | 6 + CMakeLists.txt | 4 +- clients/tests/accuracy_test_adhoc.cpp | 21 ++-- docs/doxygen/Doxyfile | 2 +- library/src/device/kernel-generator.py | 161 +++++++++++++++++++++++-- 5 files changed, 176 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 906a4f80..39546ab8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ Documentation for rocFFT is available at [https://rocm.docs.amd.com/projects/rocFFT/en/latest/](https://rocm.docs.amd.com/projects/rocFFT/en/latest/). +## rocFFT 1.0.29 for ROCm 6.2.1 + +### Optimizations + +* Implemented 1D kernels for factorizable sizes < 1024 + ## rocFFT 1.0.28 for ROCm 6.2.0 ### Optimizations diff --git a/CMakeLists.txt b/CMakeLists.txt index 691fa7a5..d25626d0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ # ############################################################################# -# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2016 - 2024 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -91,7 +91,7 @@ include( ROCMClients ) include( ROCMHeaderWrapper ) # Using standardized versioning from rocm-cmake -set ( VERSION_STRING "1.0.28" ) +set ( VERSION_STRING "1.0.29" ) rocm_setup_version( VERSION ${VERSION_STRING} ) # Append our library helper cmake path and the cmake path for hip (for diff --git a/clients/tests/accuracy_test_adhoc.cpp b/clients/tests/accuracy_test_adhoc.cpp index c8d99e0d..bbb2e7ea 100644 --- a/clients/tests/accuracy_test_adhoc.cpp +++ b/clients/tests/accuracy_test_adhoc.cpp @@ -257,14 +257,19 @@ INSTANTIATE_TEST_SUITE_P(adhoc_stride, accuracy_test::TestName); auto adhoc_tokens = { - "complex_forward_len_512_64_single_ip_batch_3_istride_192_3_CI_ostride_192_3_CI_idist_1_odist_" - "1_ioffset_0_0_ooffset_0_0", - "real_forward_len_1024_1024_1024_single_op_batch_1_istride_1048576_1024_1_R_ostride_525312_513_" - "1_HI_idist_1073741824_odist_537919488_ioffset_0_0_ooffset_0_0", - "complex_forward_len_6144_single_ip_batch_34_istride_35_CI_ostride_35_CI_idist_1_odist_1_" - "ioffset_0_0_ooffset_0_0", - "real_forward_len_8192_single_ip_batch_65537_istride_1_R_ostride_1_HI_" - "idist_8194_odist_4097_ioffset_0_0_ooffset_0_0", + // clang-format off + "complex_forward_len_512_64_single_ip_batch_3_istride_192_3_CI_ostride_192_3_CI_idist_1_odist_1_ioffset_0_0_ooffset_0_0", + "real_forward_len_1024_1024_1024_single_op_batch_1_istride_1048576_1024_1_R_ostride_525312_513_1_HI_idist_1073741824_odist_537919488_ioffset_0_0_ooffset_0_0", + "complex_forward_len_6144_single_ip_batch_34_istride_35_CI_ostride_35_CI_idist_1_odist_1_ioffset_0_0_ooffset_0_0", + "real_forward_len_8192_single_ip_batch_65537_istride_1_R_ostride_1_HI_idist_8194_odist_4097_ioffset_0_0_ooffset_0_0", + "real_forward_len_520_single_op_batch_270400_istride_1_R_ostride_1_HI_idist_520_odist_261_ioffset_0_0_ooffset_0_0", + "real_forward_len_630_single_op_batch_396900_istride_1_R_ostride_1_HI_idist_630_odist_316_ioffset_0_0_ooffset_0_0", + "real_forward_len_660_single_op_batch_435600_istride_1_R_ostride_1_HI_idist_660_odist_331_ioffset_0_0_ooffset_0_0", + "real_forward_len_700_single_op_batch_490000_istride_1_R_ostride_1_HI_idist_700_odist_351_ioffset_0_0_ooffset_0_0", + "real_forward_len_728_single_op_batch_529984_istride_1_R_ostride_1_HI_idist_728_odist_365_ioffset_0_0_ooffset_0_0", + "real_forward_len_968_single_op_batch_937024_istride_1_R_ostride_1_HI_idist_968_odist_485_ioffset_0_0_ooffset_0_0", + "real_forward_len_1020_single_op_batch_1040400_istride_1_R_ostride_1_HI_idist_1020_odist_511_ioffset_0_0_ooffset_0_0", + // clang-format on }; INSTANTIATE_TEST_SUITE_P(adhoc_token, diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile index 8a51dba7..52af7e98 100644 --- a/docs/doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "rocFFT" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = v1.0.28 +PROJECT_NUMBER = v1.0.29 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/library/src/device/kernel-generator.py b/library/src/device/kernel-generator.py index 8a3976fd..8bd2d50d 100644 --- a/library/src/device/kernel-generator.py +++ b/library/src/device/kernel-generator.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -# Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2021 - 2024 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -307,64 +307,217 @@ def list_small_kernels(): NS(length= 125, workgroup_size=256, threads_per_transform= 25, factors=(5, 5, 5), half_lds=False, direct_to_from_reg=False), NS(length= 126, workgroup_size= 256, threads_per_transform= 42, factors=(6, 7, 3), half_lds=False, runtime_compile=True), NS(length= 128, workgroup_size=256, threads_per_transform= 16, factors=(16, 8)), + NS(length= 130, workgroup_size= 64, threads_per_transform= 13, factors=(13, 10), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 132, workgroup_size=128, threads_per_transform= 22, factors=(11, 6, 2), half_lds=False, direct_to_from_reg=False, runtime_compile=True), NS(length= 135, workgroup_size=128, threads_per_transform= 9, factors=(5, 3, 3, 3), runtime_compile=True), + NS(length= 136, workgroup_size=128, threads_per_transform=17, factors=(17, 8), runtime_compile=True), + NS(length= 140, workgroup_size= 64, threads_per_transform= 28, factors=(7, 5, 4), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 143, workgroup_size=256, threads_per_transform= 13, factors=(13, 11), half_lds=False, runtime_compile=True), NS(length= 144, workgroup_size=128, threads_per_transform= 12, factors=(6, 6, 4)), + NS(length= 147, workgroup_size= 64, threads_per_transform= 21, factors=(7, 7, 3), half_lds=False, direct_to_from_reg=False, runtime_compile=True), NS(length= 150, workgroup_size= 64, threads_per_transform= 5, factors=(10, 5, 3), runtime_compile=True), + NS(length= 153, workgroup_size=128, threads_per_transform= 17, factors=(17, 9), runtime_compile=True), + NS(length= 154, workgroup_size=128, threads_per_transform= 22, factors=(11, 7, 2), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 156, workgroup_size= 128, threads_per_transform=13, factors=(3, 4, 13), half_lds=False, runtime_compile=True), NS(length= 160, workgroup_size=256, threads_per_transform= 16, factors=(16, 10)), NS(length= 162, workgroup_size=256, threads_per_transform= 27, factors=(6, 3, 3, 3), runtime_compile=True), + NS(length= 165, workgroup_size= 64, threads_per_transform= 11, factors=(11, 5, 3), half_lds=False, direct_to_from_reg=False, runtime_compile=True), NS(length= 168, workgroup_size=256, threads_per_transform= 56, factors=(8, 7, 3), half_lds=False, direct_to_from_reg=False), NS(length= 169, workgroup_size=256, threads_per_transform= 13, factors=(13, 13), runtime_compile=True), + NS(length= 170, workgroup_size=128, threads_per_transform= 17, factors=(17, 10), runtime_compile=True), + NS(length= 175, workgroup_size=256, threads_per_transform= 35, factors=(5, 5, 7), half_lds=False, runtime_compile=True), NS(length= 176, workgroup_size= 64, threads_per_transform= 16, factors=(11, 16), runtime_compile=True), NS(length= 180, workgroup_size=256, threads_per_transform= 60, factors=(10, 6, 3), half_lds=False, direct_to_from_reg=False), + NS(length= 182, workgroup_size= 64, threads_per_transform= 13, factors=(13, 2, 7), half_lds=False, runtime_compile=True), + NS(length= 187, workgroup_size=128, threads_per_transform= 17, factors=(17, 11), runtime_compile=True), + NS(length= 189, workgroup_size= 64, threads_per_transform= 21, factors=(7, 3, 3, 3), half_lds=False, direct_to_from_reg=False, runtime_compile=True), NS(length= 192, workgroup_size=128, threads_per_transform= 16, factors=(6, 4, 4, 2)), + NS(length= 195, workgroup_size= 64, threads_per_transform= 13, factors=(13, 5, 3), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 196, workgroup_size= 64, threads_per_transform= 28, factors=(4, 7, 7), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 198, workgroup_size=128, threads_per_transform= 22, factors=(11, 2, 9), half_lds=False, runtime_compile=True), NS(length= 200, workgroup_size= 64, threads_per_transform= 20, factors=(10, 10, 2)), + NS(length= 204, workgroup_size=128, threads_per_transform= 17, factors=(17, 4, 3), runtime_compile=True), NS(length= 208, workgroup_size= 64, threads_per_transform= 16, factors=(13, 16)), + NS(length= 210, workgroup_size= 64, threads_per_transform= 30, factors=(10, 7, 3), half_lds=False, direct_to_from_reg=False, runtime_compile=True), NS(length= 216, workgroup_size=256, threads_per_transform= 36, factors=(6, 6, 6)), + NS(length= 220, workgroup_size=128, threads_per_transform= 22, factors=(10, 2, 11), half_lds=False, runtime_compile=True), + NS(length= 221, workgroup_size=128, threads_per_transform= 17, factors=(17, 13), runtime_compile=True), NS(length= 224, workgroup_size= 64, threads_per_transform= 16, factors=(7, 2, 2, 2, 2, 2)), NS(length= 225, workgroup_size=256, threads_per_transform= 75, factors=(5, 5, 3, 3), runtime_compile=True), + NS(length= 231, workgroup_size=256, threads_per_transform= 33, factors=(11, 7, 3), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 234, workgroup_size= 64, threads_per_transform= 26, factors=(13, 9, 2), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 238, workgroup_size= 64, threads_per_transform= 17, factors=(17, 7, 2), runtime_compile=True), NS(length= 240, workgroup_size=128, threads_per_transform= 48, factors=(8, 5, 6)), + NS(length= 242, workgroup_size=128, threads_per_transform= 22, factors=(11, 2, 11), half_lds=False, runtime_compile=True), NS(length= 243, workgroup_size=256, threads_per_transform= 81, factors=(3, 3, 3, 3, 3)), + NS(length= 245, workgroup_size=256, threads_per_transform= 35, factors=(7, 5, 7), half_lds=False, runtime_compile=True), NS(length= 250, workgroup_size=128, threads_per_transform= 25, factors=(10, 5, 5), runtime_compile=True), + NS(length= 252, workgroup_size= 64, threads_per_transform= 63, factors=(7, 3, 3, 4), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 255, workgroup_size= 64, threads_per_transform= 17, factors=(17, 5, 3), runtime_compile=True), NS(length= 256, workgroup_size= 64, threads_per_transform= 64, factors=(4, 4, 4, 4)), + NS(length= 260, workgroup_size= 64, threads_per_transform= 26, factors=(13, 10, 2), half_lds=False, runtime_compile=True), + NS(length= 264, workgroup_size=256, threads_per_transform= 33, factors=(8, 3, 11), half_lds=False, runtime_compile=True), NS(length= 270, workgroup_size=128, threads_per_transform= 27, factors=(10, 3, 3, 3)), NS(length= 272, workgroup_size=128, threads_per_transform= 17, factors=(16, 17), runtime_compile=True), + NS(length= 273, workgroup_size= 64, threads_per_transform= 13, factors=(13, 3, 7), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 275, workgroup_size= 64, threads_per_transform= 55, factors=(11, 5, 5), half_lds=False, runtime_compile=True), + NS(length= 280, workgroup_size= 64, threads_per_transform= 56, factors=(8, 7, 5), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 286, workgroup_size= 64, threads_per_transform= 26, factors=(13, 11, 2), half_lds=False, direct_to_from_reg=False, runtime_compile=True), NS(length= 288, workgroup_size=128, threads_per_transform= 24, factors=(6, 6, 4, 2), runtime_compile=True), NS(length= 289, workgroup_size=128, threads_per_transform= 17, factors=(17, 17), runtime_compile=True), + NS(length= 294, workgroup_size=128, threads_per_transform= 42, factors=(6, 7, 7), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 297, workgroup_size=256, threads_per_transform= 33, factors=(9, 3, 11), runtime_compile=True), NS(length= 300, workgroup_size= 64, threads_per_transform= 30, factors=(10, 10, 3), runtime_compile=True), + NS(length= 306, workgroup_size=256, threads_per_transform= 34, factors=(17, 2, 9), runtime_compile=True), + NS(length= 308, workgroup_size= 64, threads_per_transform= 44, factors=(11, 7, 4), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 312, workgroup_size= 64, threads_per_transform= 26, factors=(13, 4, 3, 2), half_lds=False, runtime_compile=True), + NS(length= 315, workgroup_size= 64, threads_per_transform= 63, factors=(7, 3, 3, 5), half_lds=False, direct_to_from_reg=False, runtime_compile=True), NS(length= 320, workgroup_size= 64, threads_per_transform= 16, factors=(10, 4, 4, 2), runtime_compile=True), NS(length= 324, workgroup_size= 64, threads_per_transform= 54, factors=(3, 6, 6, 3), runtime_compile=True), + NS(length= 325, workgroup_size= 64, threads_per_transform= 13, factors=(13, 5, 5), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 330, workgroup_size=128, threads_per_transform= 33, factors=(11, 10, 3), half_lds=False, direct_to_from_reg=False, runtime_compile=True), NS(length= 336, workgroup_size=128, threads_per_transform= 56, factors=(8, 7, 6)), + NS(length= 338, workgroup_size= 64, threads_per_transform= 26, factors=(13, 2, 13), runtime_compile=True), + NS(length= 340, workgroup_size=128, threads_per_transform= 34, factors=(17, 2, 10), runtime_compile=True), NS(length= 343, workgroup_size=256, threads_per_transform= 49, factors=(7, 7, 7), runtime_compile=True), + NS(length= 350, workgroup_size= 64, threads_per_transform= 50, factors=(5, 7, 10), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 351, workgroup_size=128, threads_per_transform= 39, factors=(13, 3, 9), half_lds=False, runtime_compile=True), + NS(length= 352, workgroup_size= 64, threads_per_transform= 32, factors=(11, 2, 16), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 357, workgroup_size=256, threads_per_transform= 17, factors=(17, 3, 7), runtime_compile=True), NS(length= 360, workgroup_size=256, threads_per_transform= 60, factors=(10, 6, 6), runtime_compile=True), + NS(length= 363, workgroup_size=128, threads_per_transform= 33, factors=(11, 3, 11), runtime_compile=True), + NS(length= 364, workgroup_size= 64, threads_per_transform= 52, factors=(13, 7, 4), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 374, workgroup_size=256, threads_per_transform= 34, factors=(17, 2, 11), runtime_compile=True), NS(length= 375, workgroup_size=128, threads_per_transform= 25, factors=(5, 5, 5, 3), runtime_compile=True), + NS(length= 378, workgroup_size=128, threads_per_transform=126, factors=(6, 3, 3, 7), half_lds=False, runtime_compile=True), NS(length= 384, workgroup_size=128, threads_per_transform= 32, factors=(6, 4, 4, 4), runtime_compile=True), + NS(length= 385, workgroup_size= 64, threads_per_transform= 55, factors=(11, 7, 5), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 390, workgroup_size=128, threads_per_transform= 39, factors=(13, 3, 10), half_lds=False, runtime_compile=True), + NS(length= 392, workgroup_size= 64, threads_per_transform= 56, factors=(8, 7, 7), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 396, workgroup_size= 64, threads_per_transform= 44, factors=(11, 9, 4), half_lds=False, direct_to_from_reg=False, runtime_compile=True), NS(length= 400, workgroup_size=128, threads_per_transform= 40, factors=(4, 10, 10), runtime_compile=True), NS(length= 405, workgroup_size=128, threads_per_transform= 27, factors=(5, 3, 3, 3, 3), runtime_compile=True), + NS(length= 408, workgroup_size= 64, threads_per_transform= 17, factors=(17, 3, 8), runtime_compile=True), + NS(length= 416, workgroup_size= 64, threads_per_transform= 32, factors=(13, 2, 16), half_lds=False, runtime_compile=True), + NS(length= 420, workgroup_size= 64, threads_per_transform= 60, factors=(10, 7, 6), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 425, workgroup_size= 64, threads_per_transform= 17, factors=(17, 5, 5), runtime_compile=True), + NS(length= 429, workgroup_size=128, threads_per_transform= 39, factors=(13, 3, 11), half_lds=False, runtime_compile=True), NS(length= 432, workgroup_size= 64, threads_per_transform= 27, factors=(3, 16, 3, 3), runtime_compile=True), + NS(length= 440, workgroup_size= 64, threads_per_transform= 55, factors=(11, 8, 5), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 441, workgroup_size= 64, threads_per_transform= 63, factors=(9, 7, 7), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 442, workgroup_size=256, threads_per_transform= 34, factors=(17, 2, 13), runtime_compile=True), + NS(length= 448, workgroup_size=128, threads_per_transform= 64, factors=(8, 7, 8), half_lds=False, direct_to_from_reg=False, runtime_compile=True), NS(length= 450, workgroup_size=128, threads_per_transform= 30, factors=(10, 5, 3, 3), runtime_compile=True), + NS(length= 455, workgroup_size=256, threads_per_transform= 65, factors=(13, 5, 7), half_lds=False, runtime_compile=True), + NS(length= 459, workgroup_size=256, threads_per_transform= 51, factors=(17, 3, 9), runtime_compile=True), + NS(length= 462, workgroup_size=256, threads_per_transform= 77, factors=(11, 6, 7), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 468, workgroup_size= 64, threads_per_transform= 52, factors=(13, 9, 4), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 476, workgroup_size=128, threads_per_transform= 34, factors=(17, 2, 7, 2), runtime_compile=True), NS(length= 480, workgroup_size= 64, threads_per_transform= 16, factors=(10, 8, 6), runtime_compile=True), + NS(length= 484, workgroup_size= 64, threads_per_transform= 44, factors=(4, 11, 11), half_lds=False, direct_to_from_reg=False, runtime_compile=True), NS(length= 486, workgroup_size=256, threads_per_transform=162, factors=(6, 3, 3, 3, 3), runtime_compile=True), + NS(length= 490, workgroup_size=256, threads_per_transform= 70, factors=(10, 7, 7), half_lds=False, runtime_compile=True), + NS(length= 495, workgroup_size= 64, threads_per_transform= 55, factors=(11, 9, 5), half_lds=False, direct_to_from_reg=False, runtime_compile=True), NS(length= 500, workgroup_size=128, threads_per_transform=100, factors=(10, 5, 10), runtime_compile=True), + NS(length= 504, workgroup_size= 64, threads_per_transform= 63, factors=(7, 9, 4, 2), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 507, workgroup_size=128, threads_per_transform= 39, factors=(13, 3, 13), runtime_compile=True), + NS(length= 510, workgroup_size=256, threads_per_transform= 34, factors=(17, 2, 3, 5), runtime_compile=True), NS(length= 512, workgroup_size= 64, threads_per_transform= 64, factors=(8, 8, 8)), + NS(length= 520, workgroup_size= 64, threads_per_transform= 52, factors=(13, 10, 4), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 525, workgroup_size= 128, threads_per_transform=105, factors=(7, 3, 5, 5), half_lds=False, direct_to_from_reg=False, runtime_compile=True), NS(length= 528, workgroup_size= 64, threads_per_transform= 48, factors=(4, 4, 3, 11), runtime_compile=True), + NS(length= 539, workgroup_size=256, threads_per_transform= 77, factors=(11, 7, 7), runtime_compile=True), NS(length= 540, workgroup_size=256, threads_per_transform= 54, factors=(3, 10, 6, 3), runtime_compile=True), + NS(length= 544, workgroup_size=128, threads_per_transform= 34, factors=(17, 2, 16), runtime_compile=True), + NS(length= 546, workgroup_size=128, threads_per_transform= 39, factors=(13, 3, 7, 2), runtime_compile=True), + NS(length= 550, workgroup_size= 64, threads_per_transform= 55, factors=(11, 10, 5), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 560, workgroup_size= 64, threads_per_transform= 56, factors=(8, 7, 5, 2), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 561, workgroup_size=256, threads_per_transform= 51, factors=(17, 3, 11), runtime_compile=True), + NS(length= 567, workgroup_size= 64, threads_per_transform= 63, factors=(7, 9, 3, 3), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 572, workgroup_size= 64, threads_per_transform= 52, factors=(13, 11, 4), half_lds=False, direct_to_from_reg=False, runtime_compile=True), NS(length= 576, workgroup_size=128, threads_per_transform= 96, factors=(16, 6, 6), runtime_compile=True), + NS(length= 578, workgroup_size= 256, threads_per_transform=34, factors=(17, 17, 2), runtime_compile=True), + NS(length= 585, workgroup_size= 256, threads_per_transform=65, factors=(13, 5, 9), half_lds=False, runtime_compile=True), + NS(length= 588, workgroup_size= 256, threads_per_transform=84, factors=(7, 3, 4, 7), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 594, workgroup_size=128, threads_per_transform= 99, factors=(11, 3, 6, 3), half_lds=False, runtime_compile=True), + NS(length= 595, workgroup_size= 64, threads_per_transform= 17, factors=(7, 17, 5), runtime_compile=True), NS(length= 600, workgroup_size= 64, threads_per_transform= 60, factors=(10, 6, 10), runtime_compile=True), + NS(length= 605, workgroup_size= 64, threads_per_transform= 55, factors=(11, 5, 11), half_lds=False, runtime_compile=True), + NS(length= 612, workgroup_size= 64, threads_per_transform= 51, factors=(17, 3, 6, 2), runtime_compile=True), + NS(length= 616, workgroup_size=128, threads_per_transform= 88, factors=(11, 7, 8), half_lds=False, runtime_compile=True), + NS(length= 624, workgroup_size= 64, threads_per_transform= 52, factors=(13, 4, 6, 2), half_lds=False, runtime_compile=True), NS(length= 625, workgroup_size=128, threads_per_transform=125, factors=(5, 5, 5, 5), runtime_compile=True), + NS(length= 630, workgroup_size= 64, threads_per_transform= 63, factors=(3, 3, 5, 7, 2), runtime_compile=True), + NS(length= 637, workgroup_size=128, threads_per_transform= 91, factors=(13, 7, 7), runtime_compile=True), NS(length= 640, workgroup_size=128, threads_per_transform= 64, factors=(8, 10, 8), runtime_compile=True), NS(length= 648, workgroup_size=256, threads_per_transform=216, factors=(8, 3, 3, 3, 3), runtime_compile=True), + NS(length= 650, workgroup_size= 256, threads_per_transform=65, factors=(10, 5, 13), half_lds=False, runtime_compile=True), + NS(length= 660, workgroup_size=128, threads_per_transform=110, factors=(11, 6, 10), runtime_compile=True), + NS(length= 663, workgroup_size= 64, threads_per_transform= 51, factors=(17, 13, 3), half_lds=False, runtime_compile=True), + NS(length= 672, workgroup_size= 64, threads_per_transform= 56, factors=(2, 2, 2, 2, 2, 3, 7), runtime_compile=True), NS(length= 675, workgroup_size=256, threads_per_transform=225, factors=(5, 5, 3, 3, 3), runtime_compile=True), + NS(length= 676, workgroup_size= 64, threads_per_transform= 52, factors=(13, 13, 4), half_lds=False, runtime_compile=True), + NS(length= 680, workgroup_size=256, threads_per_transform= 68, factors=(17, 4, 10), runtime_compile=True), + NS(length= 686, workgroup_size= 64, threads_per_transform= 49, factors=(7, 7, 7, 2), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 693, workgroup_size=128, threads_per_transform= 99, factors=(11, 7, 9), runtime_compile=True), + NS(length= 700, workgroup_size= 128, threads_per_transform=100, factors=(10, 7, 10), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 702, workgroup_size= 128, threads_per_transform=117, factors=(13, 3, 6, 3), runtime_compile=True), + NS(length= 704, workgroup_size=256, threads_per_transform=88, factors=(2, 2, 2, 2, 11, 2, 2), runtime_compile=True), + NS(length= 714, workgroup_size=64, threads_per_transform=51, factors=(3, 17, 7, 2), runtime_compile=True), + NS(length= 715, workgroup_size=256, threads_per_transform= 65, factors=(13, 5, 11), runtime_compile=True), NS(length= 720, workgroup_size=256, threads_per_transform=120, factors=(10, 3, 8, 3), runtime_compile=True), + NS(length= 726, workgroup_size=256, threads_per_transform= 66, factors=(11, 6, 11), half_lds=False, runtime_compile=True), + NS(length= 728, workgroup_size=128, threads_per_transform=104, factors=(13, 7, 8), runtime_compile=True), NS(length= 729, workgroup_size=256, threads_per_transform=243, factors=(3, 3, 3, 3, 3, 3), runtime_compile=True), + NS(length= 735, workgroup_size= 256, threads_per_transform=147, factors=(7, 3, 5, 7), half_lds=False, runtime_compile=True), + NS(length= 748, workgroup_size= 256, threads_per_transform=68, factors=(17, 4, 11), runtime_compile=True), NS(length= 750, workgroup_size=256, threads_per_transform=250, factors=(10, 5, 3, 5), runtime_compile=True), + NS(length= 756, workgroup_size= 64, threads_per_transform= 63, factors=(2, 2, 3, 3, 3, 7), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 765, workgroup_size=256, threads_per_transform= 51, factors=(17, 3, 5, 3), runtime_compile=True), NS(length= 768, workgroup_size= 64, threads_per_transform= 48, factors=(16, 3, 16), runtime_compile=True), + NS(length= 770, workgroup_size=256, threads_per_transform=110, factors=(11, 10, 7), half_lds=False, runtime_compile=True), + NS(length= 780, workgroup_size=256, threads_per_transform= 78, factors=(2, 3, 13, 5, 2), runtime_compile=True), + NS(length= 784, workgroup_size= 64, threads_per_transform= 56, factors=(2, 2, 2, 2, 7, 7), runtime_compile=True), + NS(length= 792, workgroup_size=256, threads_per_transform= 88, factors=(2, 2, 2, 3, 3, 11), half_lds=False, runtime_compile=True), NS(length= 800, workgroup_size=256, threads_per_transform=160, factors=(16, 5, 10), runtime_compile=True), NS(length= 810, workgroup_size=128, threads_per_transform= 81, factors=(3, 10, 3, 3, 3), runtime_compile=True), + NS(length= 816, workgroup_size= 64, threads_per_transform= 51, factors=(17, 2, 3, 2, 2, 2), runtime_compile=True), + NS(length= 819, workgroup_size=128, threads_per_transform=117, factors=(9, 7, 13), half_lds=False, runtime_compile=True), + NS(length= 825, workgroup_size= 64, threads_per_transform= 55, factors=(11, 5, 5, 3), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 832, workgroup_size=128, threads_per_transform=104, factors=(13, 2, 2, 2, 2, 2, 2), runtime_compile=True), + NS(length= 833, workgroup_size=128, threads_per_transform=119, factors=(17, 7, 7), runtime_compile=True), + NS(length= 840, workgroup_size= 64, threads_per_transform= 56, factors=(2, 2, 2, 3, 5, 7), runtime_compile=True), + NS(length= 845, workgroup_size= 256, threads_per_transform=65, factors=(13, 5, 13), runtime_compile=True), + NS(length= 847, workgroup_size= 256, threads_per_transform=77, factors=(11, 7, 11), runtime_compile=True), + NS(length= 850, workgroup_size= 128, threads_per_transform=85, factors=(10, 5, 17), half_lds=False, runtime_compile=True), + NS(length= 858, workgroup_size= 256, threads_per_transform=78, factors=(13, 11, 6), runtime_compile=True), NS(length= 864, workgroup_size= 64, threads_per_transform= 54, factors=(3, 6, 16, 3), runtime_compile=True), + NS(length= 867, workgroup_size= 64, threads_per_transform=51, factors=(17, 17, 3), runtime_compile=True), + NS(length= 875, workgroup_size= 256, threads_per_transform=175, factors=(7, 5, 5, 5), half_lds=False, runtime_compile=True), + NS(length= 880, workgroup_size=256, threads_per_transform= 88, factors=(2, 2, 2, 2, 11, 5), runtime_compile=True), + NS(length= 882, workgroup_size= 64, threads_per_transform=63, factors=(9, 7, 7, 2), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 884, workgroup_size= 256, threads_per_transform=68, factors=(13, 4, 17), runtime_compile=True), + NS(length= 891, workgroup_size= 256, threads_per_transform=99, factors=(9, 11, 3, 3), runtime_compile=True), + NS(length= 896, workgroup_size=128, threads_per_transform=112, factors=(2, 2, 2, 2, 2, 2, 2, 7), half_lds=False, direct_to_from_reg=False, runtime_compile=True), NS(length= 900, workgroup_size=256, threads_per_transform= 90, factors=(10, 10, 3, 3), runtime_compile=True), + NS(length= 910, workgroup_size=256, threads_per_transform= 91, factors=(13, 2, 7, 5), half_lds=False, runtime_compile=True), + NS(length= 918, workgroup_size=128, threads_per_transform=102, factors=(17, 9, 2, 3), runtime_compile=True), + NS(length= 924, workgroup_size= 64, threads_per_transform= 44, factors=(2, 2, 3, 7, 11), runtime_compile=True), + NS(length= 935, workgroup_size= 256, threads_per_transform= 85, factors=(17, 11, 5), runtime_compile=True), + NS(length= 936, workgroup_size=256, threads_per_transform= 78, factors=(2, 2, 13, 2, 3, 3), runtime_compile=True), + NS(length= 945, workgroup_size= 64, threads_per_transform= 63, factors=(3, 3, 3, 5, 7), runtime_compile=True), + NS(length= 952, workgroup_size=256, threads_per_transform= 68, factors=(17, 4, 2, 7), runtime_compile=True), NS(length= 960, workgroup_size=256, threads_per_transform=160, factors=(16, 10, 6), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 968, workgroup_size=256, threads_per_transform= 88, factors=(2, 2, 2, 11, 11), half_lds=False, runtime_compile=True), NS(length= 972, workgroup_size=256, threads_per_transform=162, factors=(3, 6, 3, 6, 3), runtime_compile=True), + NS(length= 975, workgroup_size=128, threads_per_transform= 39, factors=(13, 5, 3, 5), runtime_compile=True), + NS(length= 980, workgroup_size= 256, threads_per_transform=196, factors=(7, 5, 7, 4), half_lds=False, direct_to_from_reg=False, runtime_compile=True), + NS(length= 990, workgroup_size=128, threads_per_transform=110, factors=(2, 3, 3, 5, 11), half_lds=False, runtime_compile=True), NS(length=1000, workgroup_size=128, threads_per_transform=100, factors=(10, 10, 10), runtime_compile=True), + NS(length=1001, workgroup_size=256, threads_per_transform= 91, factors=(13, 7, 11), runtime_compile=True), + NS(length=1008, workgroup_size= 64, threads_per_transform= 56, factors=(2, 2, 2, 2, 3, 3, 7), runtime_compile=True), + NS(length=1014, workgroup_size=256, threads_per_transform= 78, factors=(13, 6, 13), half_lds=False, runtime_compile=True), + NS(length=1020, workgroup_size=256, threads_per_transform= 68, factors=(2, 17, 2, 3, 5), runtime_compile=True), NS(length=1024, workgroup_size=128, threads_per_transform=128, factors=(8, 8, 4, 4)), NS(length=1040, workgroup_size=256, threads_per_transform=208, factors=(13, 16, 5), runtime_compile=True), NS(length=1080, workgroup_size=256, threads_per_transform=108, factors=(6, 10, 6, 3), runtime_compile=True), @@ -884,9 +1037,6 @@ def cli(): parser.add_argument('--manual-large', type=str, help='Large kernel sizes to generate.') - parser.add_argument('--runtime-compile', - type=str, - help='Allow runtime-compiled kernels.') parser.add_argument('--runtime-compile-default', type=str, help='Compile kernels at runtime by default.') @@ -975,9 +1125,6 @@ def cli(): kernels = default_runtime_compile(kernels, args.runtime_compile_default == 'ON') - if args.runtime_compile != 'ON': - for k in kernels: - k.runtime_compile = False # # sub commands