forked from graphcore/examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdynamic_sparse_fc.py
202 lines (167 loc) · 7.65 KB
/
dynamic_sparse_fc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
#!/usr/bin/env python
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
"""
Benchmark a single dynamic sparse FC layer.
The figures reported at the end of the benchmark are based on wall time.
Run with -h or --help for options.
"""
import inspect
import os
import sys
import tensorflow.compat.v1 as tf
import numpy as np
from functools import partial
import logging
# Directory containing this script; used as the anchor for the relative
# path added to sys.path below.
cwd = os.path.dirname(os.path.abspath(__file__))
# Extend the import search path with the dynamic_sparsity application
# directory three levels up (presumably where ipu_sparse_ops lives), so the
# import that follows can be resolved.
sys.path.insert(1, os.path.join(cwd, '..', '..', '..', 'applications',
'tensorflow', 'dynamic_sparsity'))
# This import has to come after the sys.path change above, hence the noqa.
from ipu_sparse_ops import sparse, layers, optimizers # noqa
# The script uses the TF1 compatibility API: turn off eager execution and
# the remaining TF2 behaviours before any graph is built.
tf.disable_eager_execution()
tf.disable_v2_behavior()
def add_args(parser):
    """Register this benchmark's command-line options on ``parser``.

    Args:
        parser: An argparse-style parser providing ``add_argument`` and
            ``set_defaults``.

    Returns:
        The ``parser`` that was passed in, with the options added.
    """
    # (flag, type, help) for the options that take a plain typed value and
    # get their default from the set_defaults() call below.
    typed_options = (
        ("--batch-size", int,
         "Number of vectors in a mini-batch"),
        ("--input-size", int,
         "Input size (rows of sparse weight matrix)"),
        ("--output-size", int,
         "Output size (cols of sparse weight matrix)"),
        ("--density", float,
         "Non-zero density (e.g. 0.1 => 10 percent of elements are non-zero)."),
        ("--np-seed", int,
         "Set the random seed for numpy."),
    )
    for flag, value_type, description in typed_options:
        parser.add_argument(flag, type=value_type, help=description)

    parser.add_argument("--train", action='store_true', dest='train',
                        help="Compute loss and optimization pass")

    # batches_per_step and steps are not declared in this function; their
    # defaults are still set here alongside the options declared above.
    default_values = {
        "batch_size": 32,
        "input_size": 1024,
        "output_size": 1024,
        "density": 0.1,
        "np_seed": 42,
        "batches_per_step": 5000,
        "steps": 5,
    }
    parser.set_defaults(**default_values)

    parser.add_argument('--meta-info-oversize', default=0.2, type=float,
                        help="Sets the Popsparse matmul option 'metaInfoBucketOversizeProportion'.")
    parser.add_argument("--data-type", type=str,
                        help="Choose the floating point type for the weights.",
                        choices=['fp32', 'fp16'])
    return parser
def get_matmul_options(args):
    """Assemble the option dictionary passed to the sparse matmul.

    Args:
        args: Parsed options; ``train`` and ``meta_info_oversize`` are read.

    Returns:
        A dict with the keys ``availableMemoryProportion``, ``doGradAPass``,
        ``doGradWPass`` and ``metaInfoBucketOversizeProportion``.
    """
    # Both gradient passes follow the --train flag and are expressed as the
    # strings "true" / "false" rather than as booleans.
    grad_flag = "false"
    if args.train:
        grad_flag = "true"
    return {
        "availableMemoryProportion": 1.0,
        "doGradAPass": grad_flag,
        "doGradWPass": grad_flag,
        "metaInfoBucketOversizeProportion": args.meta_info_oversize,
    }
def make_random_sparse_fc_layer(args):
    """Create the sparse FC layer under test from the benchmark options.

    The layer is built the same way whether or not ``args.train`` is set;
    the training flag only influences the matmul options produced by
    ``get_matmul_options``.

    Args:
        args: Parsed options. Reads ``np_seed``, ``data_type``,
            ``output_size``, ``batch_size``, ``input_size``, ``density`` and
            ``use_generated_data`` (plus whatever ``get_matmul_options``
            reads).

    Returns:
        The object returned by
        ``layers.SparseFcLayer.from_random_generator``.
    """
    # A fresh generator per call, seeded from the options, so each call draws
    # from an identically seeded stream.
    random_gen = np.random.default_rng(seed=args.np_seed)
    # Anything other than an explicit 'fp16' (including no --data-type at
    # all) selects float32 weights.
    weights_type = tf.float16 if args.data_type == 'fp16' else tf.float32
    return layers.SparseFcLayer.from_random_generator(
        args.output_size, [args.batch_size, args.input_size], args.density,
        random_gen.standard_normal, random_gen,
        matmul_options=get_matmul_options(args),
        name='sparse_fc_benchmark',
        bias=False, relu=False,
        disable_updating=not args.use_generated_data,
        dtype=weights_type)
def inputs(opts, index):
    """Build the synthetic input tensors for one benchmark iteration.

    Args:
        opts: Parsed options; ``data_type``, ``batch_size`` and
            ``input_size`` are read.
        index: Value that is cast to the weight dtype and broadcast over the
            whole input matrix.

    Returns:
        A dict with ``"inputs"`` (the broadcast tensor of shape
        ``[batch_size, input_size]``) and ``"cond"`` (a constant ``False``).
    """
    if opts.data_type == 'fp16':
        element_type = tf.float16
    else:
        element_type = tf.float32
    fill_value = tf.cast(index, element_type)
    batch = tf.broadcast_to(fill_value, [opts.batch_size, opts.input_size])
    return {"inputs": batch, "cond": tf.constant(False)}
def graph_builder(layers, opts, inputs):
    """Construct the graph executed on every iteration of the benchmark loop.

    Args:
        layers: Dict holding a zero-argument layer factory under ``'fc_gen'``.
            The layer it produces is stored back under ``'fc'``.
        opts: Parsed options; ``train`` and ``batches_per_step`` are read.
        inputs: Dict with an ``"inputs"`` tensor and a ``"cond"`` tensor.

    Returns:
        In inference mode, the result of calling the layer on the inputs.
        In training mode, an identity of the step-control variable that
        carries control dependencies on the train op and the counter
        increment.
    """
    layers['fc'] = layers['fc_gen']()
    fc_layer = layers['fc']

    if not opts.train:
        # Inference: just run the layer with the condition supplied by inputs.
        return fc_layer(inputs["inputs"], inputs["cond"])

    # Training: keep an iteration counter and derive a flag that is true
    # whenever the counter is a multiple of batches_per_step.
    with tf.variable_scope("counter", reuse=tf.AUTO_REUSE, use_resource=True):
        iteration_var = tf.get_variable("iterations", shape=[], dtype=tf.int32,
                                        initializer=tf.zeros_initializer())
        position_in_step = tf.math.floormod(iteration_var, opts.batches_per_step)
        boundary_flag = tf.equal(position_in_step, 0)
        bump_counter = tf.assign_add(iteration_var, 1)

    layer_output = fc_layer(inputs["inputs"], boundary_flag)

    # The loss is simply the mean over the output matrix.
    mean_loss = tf.reduce_mean(layer_output)

    with tf.variable_scope("train", reuse=tf.AUTO_REUSE, use_resource=True):
        # The train op must run as part of the benchmarking loop. A step
        # variable is kept for that purpose and tied to the train op through
        # control dependencies.
        step_control = tf.get_variable(
            "step_control", dtype=tf.int32, shape=[])
        make_optimiser = optimizers.SparseOptimizer(tf.train.MomentumOptimizer)
        sparse_optimiser = make_optimiser(
            learning_rate=0.01, momentum=0.0001, use_nesterov=True, name='optimise',
            sparse_layers=[fc_layer])
        with tf.control_dependencies([step_control]):
            minimise_op = sparse_optimiser.minimize(mean_loss)
        grouped_ops = tf.group(bump_counter, minimise_op)

    # Returning an identity that depends on the grouped ops forces both the
    # counter increment and the train op to execute.
    with tf.control_dependencies([grouped_ops]):
        step_control = tf.identity(step_control)
    return step_control
def initializer_sess(layers, utils, sess):
    """Create the placeholders of the layer stored under ``layers["fc"]``.

    Args:
        layers: Dict holding the built layer under ``"fc"``.
        utils: Not used by this function.
        sess: Not used by this function.
    """
    cpu_device = tf.device("cpu")
    with cpu_device:
        fc_layer = layers["fc"]
        fc_layer.create_placeholders()
def initializer():
    """Return the op that initialises all global TensorFlow variables."""
    init_op = tf.global_variables_initializer()
    return init_op
def iteration_report(opts, time):
    """Format the throughput figures for one timed benchmark step.

    Args:
        opts: Parsed options; ``batch_size``, ``batches_per_step``,
            ``replicas``, ``input_size``, ``output_size`` and ``density``
            are read.
        time: Duration of the step, used as the divisor for the item count.

    Returns:
        A string reporting items per second and TFLOPS per second.
    """
    items_processed = opts.batch_size * opts.batches_per_step * opts.replicas
    items_per_sec = items_processed / time
    # Each item is counted as 2 * input_size * output_size * density
    # operations; the trailing 1e-12 scales the figure to tera-operations.
    # Factors are applied one at a time, left to right.
    tflops_per_sec = items_per_sec
    for factor in (2, opts.input_size, opts.output_size, opts.density, 1e-12):
        tflops_per_sec = tflops_per_sec * factor
    return f"{items_per_sec:5f} items/sec {tflops_per_sec:.2f} TFLOPS/sec"
if __name__ == '__main__':
    # Add benchmark module to path
    cwd = os.path.dirname(os.path.abspath(inspect.stack()[0][1]))
    sys.path.insert(1, os.path.join(cwd, '..', '..', '..', 'utils',
                                    'benchmarks', 'tensorflow'))
    import benchmark

    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(name)s %(levelname)s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')

    # layer_dict is shared with graph_builder and initializer_sess through
    # functools.partial; it is filled in once the options have been parsed.
    layer_dict = {}
    utils = {"train": False}
    module = benchmark.Benchmark(
        partial(graph_builder, layer_dict),
        inputs,
        initializer,
        add_args,
        iteration_report,
        partial(initializer_sess, layer_dict, utils),
    )

    # Make the sparse FC layer and insert into the layer dictionary
    # for use by the builder functions:
    options = benchmark.parse_opts(module, False)
    np.random.seed(options.np_seed)
    layer_dict['fc_gen'] = lambda: make_random_sparse_fc_layer(options)

    if options.train:
        utils["train"] = True
        # Build one layer up front purely to report its storage footprint.
        fc = layer_dict['fc_gen']()
        # NOTE(review): the factors 2 and 4 are fixed byte sizes per element
        # and do not depend on --data-type; confirm whether fp16 weights
        # should be reported with 2 bytes per non-zero value.
        print(f"Metainfo bytes: {fc.weights.representation.metainfo_state.size * 2}")
        print(f"Non-zeros bytes: {fc.weights.representation.nz_values.size * 4}")

    if options.shards > 1:
        raise NotImplementedError(
            "--shards option has not been implemented with this example")
    if options.replicas > 1:
        raise NotImplementedError(
            "--replicas option has not been implemented with this example")

    # Log Benchmark Message
    # Report the mode that is actually being benchmarked instead of always
    # announcing "Inference".
    mode_label = "Training" if options.train else "Inference"
    batches_label = options.batches_per_step if not options.report else "n/a"
    print(f" Dynamic Sparse FC Layer {mode_label} Synthetic benchmark.\n"
          f" Batch size {options.batch_size}.\n"
          f" Batches per Step {batches_label}.\n"
          f" Equivalent dense size {options.input_size}x{options.output_size}.\n"
          f" Density {options.density} (sparsity {1-options.density}).\n")

    benchmark.run(module, options)