-
Notifications
You must be signed in to change notification settings - Fork 0
/
mediapipe_utils.py
1016 lines (914 loc) · 45.3 KB
/
mediapipe_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# mediapipe utils from https://github.com/geaxgx/depthai_hand_tracker
import cv2
import numpy as np
from collections import namedtuple
from math import ceil, sqrt, exp, pi, floor, sin, cos, atan2, gcd
# To not display: RuntimeWarning: overflow encountered in exp
# in line: scores = 1 / (1 + np.exp(-scores))
np.seterr(over='ignore')
class HandRegion:
"""
Attributes:
pd_score : detection score
pd_box : detection box [x, y, w, h], normalized [0,1] in the squared image
pd_kps : detection keypoints coordinates [x, y], normalized [0,1] in the squared image
rect_x_center, rect_y_center : center coordinates of the rotated bounding rectangle, normalized [0,1] in the squared image
rect_w, rect_h : width and height of the rotated bounding rectangle, normalized in the squared image (may be > 1)
rotation : rotation angle of rotated bounding rectangle with y-axis in radian
rect_x_center_a, rect_y_center_a : center coordinates of the rotated bounding rectangle, in pixels in the squared image
rect_w, rect_h : width and height of the rotated bounding rectangle, in pixels in the squared image
rect_points : list of the 4 points coordinates of the rotated bounding rectangle, in pixels
expressed in the squared image during processing,
expressed in the source rectangular image when returned to the user
lm_score: global landmark score
norm_landmarks : 3D landmarks coordinates in the rotated bounding rectangle, normalized [0,1]
landmarks : 2D landmark coordinates in pixel in the source rectangular image
world_landmarks : 3D landmark coordinates in meter
handedness: float between 0. and 1., > 0.5 for right hand, < 0.5 for left hand,
label: "left" or "right", handedness translated in a string,
xyz: real 3D world coordinates of the wrist landmark, or of the palm center (if landmarks are not used),
xyz_zone: (left, top, right, bottom), pixel coordinates in the source rectangular image
of the rectangular zone used to estimate the depth
gesture: (optional, set in recognize_gesture() when use_gesture==True) string corresponding to recognized gesture ("ONE","TWO","THREE","FOUR","FIVE","FIST","OK","PEACE")
or None if no gesture has been recognized
"""
def __init__(self, pd_score=None, pd_box=None, pd_kps=None):
self.pd_score = pd_score # Palm detection score
self.pd_box = pd_box # Palm detection box [x, y, w, h] normalized
self.pd_kps = pd_kps # Palm detection keypoints
self.rect_x_center_a = None
self.rect_y_center_a = None
self.rect_w_a = None
self.rotation = None
self.rect_points = None
self.lm_score = None
self.handedness = None
self.label = None
self.norm_landmarks = None
self.landmarks = None
def get_rotated_world_landmarks(self):
world_landmarks_rotated = self.world_landmarks.copy()
sin_rot = sin(self.rotation)
cos_rot = cos(self.rotation)
rot_m = np.array([[cos_rot, sin_rot], [-sin_rot, cos_rot]])
world_landmarks_rotated[:, :2] = np.dot(world_landmarks_rotated[:, :2], rot_m)
return world_landmarks_rotated
def print(self):
attrs = vars(self)
print('\n'.join("%s: %s" % item for item in attrs.items()))
class HandednessAverage:
"""
Used to store the average handeness
Why ? Handedness inferred by the landmark model is not perfect. For certain poses, it is not rare that the model thinks
that a right hand is a left hand (or vice versa). Instead of using the last inferred handedness, we prefer to use the average
of the inferred handedness on the last frames. This gives more robustness.
"""
def __init__(self):
self._total_handedness = 0
self._nb = 0
def update(self, new_handedness):
self._total_handedness += new_handedness
self._nb += 1
return self._total_handedness / self._nb
def reset(self):
self._total_handedness = self._nb = 0
SSDAnchorOptions = namedtuple('SSDAnchorOptions', [
'num_layers',
'min_scale',
'max_scale',
'input_size_height',
'input_size_width',
'anchor_offset_x',
'anchor_offset_y',
'strides',
'aspect_ratios',
'reduce_boxes_in_lowest_layer',
'interpolated_scale_aspect_ratio',
'fixed_anchor_size'])
def calculate_scale(min_scale, max_scale, stride_index, num_strides):
if num_strides == 1:
return (min_scale + max_scale) / 2
else:
return min_scale + (max_scale - min_scale) * stride_index / (num_strides - 1)
def generate_anchors(options):
"""
option : SSDAnchorOptions
# https://github.com/google/mediapipe/blob/master/mediapipe/calculators/tflite/ssd_anchors_calculator.cc
"""
anchors = []
layer_id = 0
n_strides = len(options.strides)
while layer_id < n_strides:
anchor_height = []
anchor_width = []
aspect_ratios = []
scales = []
# For same strides, we merge the anchors in the same order.
last_same_stride_layer = layer_id
while last_same_stride_layer < n_strides and \
options.strides[last_same_stride_layer] == options.strides[layer_id]:
scale = calculate_scale(options.min_scale, options.max_scale, last_same_stride_layer, n_strides)
if last_same_stride_layer == 0 and options.reduce_boxes_in_lowest_layer:
# For first layer, it can be specified to use predefined anchors.
aspect_ratios += [1.0, 2.0, 0.5]
scales += [0.1, scale, scale]
else:
aspect_ratios += options.aspect_ratios
scales += [scale] * len(options.aspect_ratios)
if options.interpolated_scale_aspect_ratio > 0:
if last_same_stride_layer == n_strides - 1:
scale_next = 1.0
else:
scale_next = calculate_scale(options.min_scale, options.max_scale, last_same_stride_layer + 1,
n_strides)
scales.append(sqrt(scale * scale_next))
aspect_ratios.append(options.interpolated_scale_aspect_ratio)
last_same_stride_layer += 1
for i, r in enumerate(aspect_ratios):
ratio_sqrts = sqrt(r)
anchor_height.append(scales[i] / ratio_sqrts)
anchor_width.append(scales[i] * ratio_sqrts)
stride = options.strides[layer_id]
feature_map_height = ceil(options.input_size_height / stride)
feature_map_width = ceil(options.input_size_width / stride)
for y in range(feature_map_height):
for x in range(feature_map_width):
for anchor_id in range(len(anchor_height)):
x_center = (x + options.anchor_offset_x) / feature_map_width
y_center = (y + options.anchor_offset_y) / feature_map_height
# new_anchor = Anchor(x_center=x_center, y_center=y_center)
if options.fixed_anchor_size:
new_anchor = [x_center, y_center, 1.0, 1.0]
# new_anchor.w = 1.0
# new_anchor.h = 1.0
else:
new_anchor = [x_center, y_center, anchor_width[anchor_id], anchor_height[anchor_id]]
# new_anchor.w = anchor_width[anchor_id]
# new_anchor.h = anchor_height[anchor_id]
anchors.append(new_anchor)
layer_id = last_same_stride_layer
return np.array(anchors)
def generate_handtracker_anchors(input_size_width, input_size_height):
# https://github.com/google/mediapipe/blob/master/mediapipe/modules/palm_detection/palm_detection_cpu.pbtxt
anchor_options = SSDAnchorOptions(num_layers=4,
min_scale=0.1484375,
max_scale=0.75,
input_size_height=input_size_height,
input_size_width=input_size_width,
anchor_offset_x=0.5,
anchor_offset_y=0.5,
strides=[8, 16, 16, 16],
aspect_ratios=[1.0],
reduce_boxes_in_lowest_layer=False,
interpolated_scale_aspect_ratio=1.0,
fixed_anchor_size=True)
return generate_anchors(anchor_options)
def decode_bboxes(score_thresh, scores, bboxes, anchors, scale=128, best_only=False):
"""
wi, hi : NN input shape
mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc
# Decodes the detection tensors generated by the model, based on
# the SSD anchors and the specification in the options, into a vector of
# detections. Each detection describes a detected object.
https://github.com/google/mediapipe/blob/master/mediapipe/modules/palm_detection/palm_detection_cpu.pbtxt :
node {
calculator: "TensorsToDetectionsCalculator"
input_stream: "TENSORS:detection_tensors"
input_side_packet: "ANCHORS:anchors"
output_stream: "DETECTIONS:unfiltered_detections"
options: {
[mediapipe.TensorsToDetectionsCalculatorOptions.ext] {
num_classes: 1
num_boxes: 896
num_coords: 18
box_coord_offset: 0
keypoint_coord_offset: 4
num_keypoints: 7
num_values_per_keypoint: 2
sigmoid_score: true
score_clipping_thresh: 100.0
reverse_output_order: true
x_scale: 128.0
y_scale: 128.0
h_scale: 128.0
w_scale: 128.0
min_score_thresh: 0.5
}
}
}
node {
calculator: "TensorsToDetectionsCalculator"
input_stream: "TENSORS:detection_tensors"
input_side_packet: "ANCHORS:anchors"
output_stream: "DETECTIONS:unfiltered_detections"
options: {
[mediapipe.TensorsToDetectionsCalculatorOptions.ext] {
num_classes: 1
num_boxes: 2016
num_coords: 18
box_coord_offset: 0
keypoint_coord_offset: 4
num_keypoints: 7
num_values_per_keypoint: 2
sigmoid_score: true
score_clipping_thresh: 100.0
reverse_output_order: true
x_scale: 192.0
y_scale: 192.0
w_scale: 192.0
h_scale: 192.0
min_score_thresh: 0.5
}
}
}
scores: shape = [number of anchors 896 or 2016]
bboxes: shape = [ number of anchors x 18], 18 = 4 (bounding box : (cx,cy,w,h) + 14 (7 palm keypoints)
"""
regions = []
scores = 1 / (1 + np.exp(-scores))
if best_only:
best_id = np.argmax(scores)
if scores[best_id] < score_thresh: return regions
det_scores = scores[best_id:best_id + 1]
det_bboxes2 = bboxes[best_id:best_id + 1]
det_anchors = anchors[best_id:best_id + 1]
else:
detection_mask = scores > score_thresh
det_scores = scores[detection_mask]
if det_scores.size == 0: return regions
det_bboxes2 = bboxes[detection_mask]
det_anchors = anchors[detection_mask]
# scale = 128 # x_scale, y_scale, w_scale, h_scale
# scale = 192 # x_scale, y_scale, w_scale, h_scale
# cx, cy, w, h = bboxes[i,:4]
# cx = cx * anchor.w / wi + anchor.x_center
# cy = cy * anchor.h / hi + anchor.y_center
# lx = lx * anchor.w / wi + anchor.x_center
# ly = ly * anchor.h / hi + anchor.y_center
det_bboxes = det_bboxes2 * np.tile(det_anchors[:, 2:4], 9) / scale + np.tile(det_anchors[:, 0:2], 9)
# w = w * anchor.w / wi (in the prvious line, we add anchor.x_center and anchor.y_center to w and h, we need to substract them now)
# h = h * anchor.h / hi
det_bboxes[:, 2:4] = det_bboxes[:, 2:4] - det_anchors[:, 0:2]
# box = [cx - w*0.5, cy - h*0.5, w, h]
det_bboxes[:, 0:2] = det_bboxes[:, 0:2] - det_bboxes[:, 3:4] * 0.5
for i in range(det_bboxes.shape[0]):
score = det_scores[i]
box = det_bboxes[i, 0:4]
# Decoded detection boxes could have negative values for width/height due
# to model prediction. Filter out those boxes
if box[2] < 0 or box[3] < 0: continue
kps = []
# 0 : wrist
# 1 : index finger joint
# 2 : middle finger joint
# 3 : ring finger joint
# 4 : little finger joint
# 5 :
# 6 : thumb joint
# for j, name in enumerate(["0", "1", "2", "3", "4", "5", "6"]):
# kps[name] = det_bboxes[i,4+j*2:6+j*2]
for kp in range(7):
kps.append(det_bboxes[i, 4 + kp * 2:6 + kp * 2])
regions.append(HandRegion(float(score), box, kps))
return regions
# Starting from opencv 4.5.4, cv2.dnn.NMSBoxes output format changed
import re
cv2_version = cv2.__version__.split('.')
v0 = int(cv2_version[0])
v1 = int(cv2_version[1])
v2 = int(re.sub(r'\D+', '', cv2_version[2]))
if v0 > 4 or (v0 == 4 and (v1 > 5 or (v1 == 5 and v2 >= 4))):
def non_max_suppression(regions, nms_thresh):
# cv2.dnn.NMSBoxes(boxes, scores, 0, nms_thresh) needs:
# boxes = [ [x, y, w, h], ...] with x, y, w, h of type int
# Currently, x, y, w, h are float between 0 and 1, so we arbitrarily multiply by 1000 and cast to int
# boxes = [r.box for r in regions]
boxes = [[int(x * 1000) for x in r.pd_box] for r in regions]
scores = [r.pd_score for r in regions]
indices = cv2.dnn.NMSBoxes(boxes, scores, 0,
nms_thresh) # Not using top_k=2 here because it does not give expected result. Bug ?
return [regions[i] for i in indices]
else:
def non_max_suppression(regions, nms_thresh):
# cv2.dnn.NMSBoxes(boxes, scores, 0, nms_thresh) needs:
# boxes = [ [x, y, w, h], ...] with x, y, w, h of type int
# Currently, x, y, w, h are float between 0 and 1, so we arbitrarily multiply by 1000 and cast to int
# boxes = [r.box for r in regions]
boxes = [[int(x * 1000) for x in r.pd_box] for r in regions]
scores = [r.pd_score for r in regions]
indices = cv2.dnn.NMSBoxes(boxes, scores, 0,
nms_thresh) # Not using top_k=2 here because it does not give expected result. Bug ?
return [regions[i[0]] for i in indices]
def normalize_radians(angle):
return angle - 2 * pi * floor((angle + pi) / (2 * pi))
def rot_vec(vec, rotation):
vx, vy = vec
return [vx * cos(rotation) - vy * sin(rotation), vx * sin(rotation) + vy * cos(rotation)]
def detections_to_rect(regions):
# https://github.com/google/mediapipe/blob/master/mediapipe/modules/hand_landmark/palm_detection_detection_to_roi.pbtxt
# # Converts results of palm detection into a rectangle (normalized by image size)
# # that encloses the palm and is rotated such that the line connecting center of
# # the wrist and MCP of the middle finger is aligned with the Y-axis of the
# # rectangle.
# node {
# calculator: "DetectionsToRectsCalculator"
# input_stream: "DETECTION:detection"
# input_stream: "IMAGE_SIZE:image_size"
# output_stream: "NORM_RECT:raw_roi"
# options: {
# [mediapipe.DetectionsToRectsCalculatorOptions.ext] {
# rotation_vector_start_keypoint_index: 0 # Center of wrist.
# rotation_vector_end_keypoint_index: 2 # MCP of middle finger.
# rotation_vector_target_angle_degrees: 90
# }
# }
target_angle = pi * 0.5 # 90 = pi/2
for region in regions:
region.rect_w = region.pd_box[2]
region.rect_h = region.pd_box[3]
region.rect_x_center = region.pd_box[0] + region.rect_w / 2
region.rect_y_center = region.pd_box[1] + region.rect_h / 2
x0, y0 = region.pd_kps[0] # wrist center
x1, y1 = region.pd_kps[2] # middle finger
rotation = target_angle - atan2(-(y1 - y0), x1 - x0)
region.rotation = normalize_radians(rotation)
def rotated_rect_to_points(cx, cy, w, h, rotation):
b = cos(rotation) * 0.5
a = sin(rotation) * 0.5
points = []
p0x = cx - a * h - b * w
p0y = cy + b * h - a * w
p1x = cx + a * h - b * w
p1y = cy - b * h - a * w
p2x = 2 * cx - p0x
p2y = 2 * cy - p0y
p3x = 2 * cx - p1x
p3y = 2 * cy - p1y
return np.array([[p0x, p0y], [p1x, p1y], [p2x, p2y], [p3x, p3y]], int)
def rect_transformation(regions, w, h):
"""
w, h : image input shape
"""
# https://github.com/google/mediapipe/blob/master/mediapipe/modules/hand_landmark/palm_detection_detection_to_roi.pbtxt
# # Expands and shifts the rectangle that contains the palm so that it's likely
# # to cover the entire hand.
# node {
# calculator: "RectTransformationCalculator"
# input_stream: "NORM_RECT:raw_roi"
# input_stream: "IMAGE_SIZE:image_size"
# output_stream: "roi"
# options: {
# [mediapipe.RectTransformationCalculatorOptions.ext] {
# scale_x: 2.6
# scale_y: 2.6
# shift_y: -0.5
# square_long: true
# }
# }
# IMHO 2.9 is better than 2.6. With 2.6, it may happen that finger tips stay outside of the bouding rotated rectangle
scale_x = 2.9
scale_y = 2.9
shift_x = 0
shift_y = -0.5
for region in regions:
width = region.rect_w
height = region.rect_h
rotation = region.rotation
if rotation == 0:
region.rect_x_center_a = (region.rect_x_center + width * shift_x) * w
region.rect_y_center_a = (region.rect_y_center + height * shift_y) * h
else:
x_shift = (w * width * shift_x * cos(rotation) - h * height * shift_y * sin(rotation)) # / w
y_shift = (w * width * shift_x * sin(rotation) + h * height * shift_y * cos(rotation)) # / h
region.rect_x_center_a = region.rect_x_center * w + x_shift
region.rect_y_center_a = region.rect_y_center * h + y_shift
# square_long: true
long_side = max(width * w, height * h)
region.rect_w_a = long_side * scale_x
region.rect_h_a = long_side * scale_y
region.rect_points = rotated_rect_to_points(region.rect_x_center_a, region.rect_y_center_a, region.rect_w_a,
region.rect_h_a, region.rotation)
def hand_landmarks_to_rect(hand):
# Calculates the ROI for the next frame from the current hand landmarks
id_wrist = 0
id_index_mcp = 5
id_middle_mcp = 9
id_ring_mcp = 13
lms_xy = hand.landmarks[:, :2]
# print(lms_xy)
# Compute rotation
x0, y0 = lms_xy[id_wrist]
x1, y1 = 0.25 * (lms_xy[id_index_mcp] + lms_xy[id_ring_mcp]) + 0.5 * lms_xy[id_middle_mcp]
rotation = 0.5 * pi - atan2(y0 - y1, x1 - x0)
rotation = normalize_radians(rotation)
# Now we work only on a subset of the landmarks
ids_for_bounding_box = [0, 1, 2, 3, 5, 6, 9, 10, 13, 14, 17, 18]
lms_xy = lms_xy[ids_for_bounding_box]
# Find center of the boundaries of landmarks
axis_aligned_center = 0.5 * (np.min(lms_xy, axis=0) + np.max(lms_xy, axis=0))
# Find boundaries of rotated landmarks
original = lms_xy - axis_aligned_center
c, s = np.cos(rotation), np.sin(rotation)
rot_mat = np.array(((c, -s), (s, c)))
projected = original.dot(rot_mat)
min_proj = np.min(projected, axis=0)
max_proj = np.max(projected, axis=0)
projected_center = 0.5 * (min_proj + max_proj)
center = rot_mat.dot(projected_center) + axis_aligned_center
width, height = max_proj - min_proj
next_hand = HandRegion()
next_hand.rect_w_a = next_hand.rect_h_a = 2 * max(width, height)
next_hand.rect_x_center_a = center[0] + 0.1 * height * s
next_hand.rect_y_center_a = center[1] - 0.1 * height * c
next_hand.rotation = rotation
next_hand.rect_points = rotated_rect_to_points(next_hand.rect_x_center_a, next_hand.rect_y_center_a,
next_hand.rect_w_a, next_hand.rect_h_a, next_hand.rotation)
return next_hand
def warp_rect_img(rect_points, img, w, h):
src = np.array(rect_points[1:], dtype=np.float32) # rect_points[0] is left bottom point !
dst = np.array([(0, 0), (h, 0), (h, w)], dtype=np.float32)
mat = cv2.getAffineTransform(src, dst)
return cv2.warpAffine(img, mat, (w, h))
def distance(a, b):
"""
a, b: 2 points (in 2D or 3D)
"""
return np.linalg.norm(a - b)
def angle(a, b, c):
# https://stackoverflow.com/questions/35176451/python-code-to-calculate-angle-between-three-point-using-their-3d-coordinates
# a, b and c : points as np.array([x, y, z])
ba = a - b
bc = c - b
cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))
angle = np.arccos(cosine_angle)
return np.degrees(angle)
def find_isp_scale_params(size, resolution, is_height=True):
"""
Find closest valid size close to 'size' and and the corresponding parameters to setIspScale()
This function is useful to work around a bug in depthai where ImageManip is scrambling images that have an invalid size
resolution: sensor resolution (width, height)
is_height : boolean that indicates if the value 'size' represents the height or the width of the image
Returns: valid size, (numerator, denominator)
"""
# We want size >= 288 (first compatible size > lm_input_size)
if size < 288:
size = 288
width, height = resolution
# We are looking for the list on integers that are divisible by 16 and
# that can be written like n/d where n <= 16 and d <= 63
if is_height:
reference = height
other = width
else:
reference = width
other = height
size_candidates = {}
for s in range(288, reference, 16):
f = gcd(reference, s)
n = s // f
d = reference // f
if n <= 16 and d <= 63 and int(round(other * n / d) % 2 == 0):
size_candidates[s] = (n, d)
# What is the candidate size closer to 'size' ?
min_dist = -1
for s in size_candidates:
dist = abs(size - s)
if min_dist == -1:
min_dist = dist
candidate = s
else:
if dist > min_dist: break
candidate = s
min_dist = dist
return candidate, size_candidates[candidate]
def recognize_gesture(hand):
# Finger states
# state: -1=unknown, 0=close, 1=open
d_3_5 = distance(hand.norm_landmarks[3], hand.norm_landmarks[5])
d_2_3 = distance(hand.norm_landmarks[2], hand.norm_landmarks[3])
angle0 = angle(hand.norm_landmarks[0], hand.norm_landmarks[1], hand.norm_landmarks[2])
angle1 = angle(hand.norm_landmarks[1], hand.norm_landmarks[2], hand.norm_landmarks[3])
angle2 = angle(hand.norm_landmarks[2], hand.norm_landmarks[3], hand.norm_landmarks[4])
hand.thumb_angle = angle0 + angle1 + angle2
if angle0 + angle1 + angle2 > 460 and d_3_5 / d_2_3 > 1.2:
hand.thumb_state = 1
else:
hand.thumb_state = 0
if hand.norm_landmarks[8][1] < hand.norm_landmarks[7][1] < hand.norm_landmarks[6][1]:
hand.index_state = 1
elif hand.norm_landmarks[6][1] < hand.norm_landmarks[8][1]:
hand.index_state = 0
else:
hand.index_state = -1
if hand.norm_landmarks[12][1] < hand.norm_landmarks[11][1] < hand.norm_landmarks[10][1]:
hand.middle_state = 1
elif hand.norm_landmarks[10][1] < hand.norm_landmarks[12][1]:
hand.middle_state = 0
else:
hand.middle_state = -1
if hand.norm_landmarks[16][1] < hand.norm_landmarks[15][1] < hand.norm_landmarks[14][1]:
hand.ring_state = 1
elif hand.norm_landmarks[14][1] < hand.norm_landmarks[16][1]:
hand.ring_state = 0
else:
hand.ring_state = -1
if hand.norm_landmarks[20][1] < hand.norm_landmarks[19][1] < hand.norm_landmarks[18][1]:
hand.little_state = 1
elif hand.norm_landmarks[18][1] < hand.norm_landmarks[20][1]:
hand.little_state = 0
else:
hand.little_state = -1
# Gesture
if hand.thumb_state == 1 and hand.index_state == 1 and hand.middle_state == 1 and hand.ring_state == 1 and hand.little_state == 1:
hand.gesture = "FIVE"
elif hand.thumb_state == 0 and hand.index_state == 0 and hand.middle_state == 0 and hand.ring_state == 0 and hand.little_state == 0:
hand.gesture = "FIST"
elif hand.thumb_state == 1 and hand.index_state == 0 and hand.middle_state == 0 and hand.ring_state == 0 and hand.little_state == 0:
hand.gesture = "OK"
elif hand.thumb_state == 0 and hand.index_state == 1 and hand.middle_state == 1 and hand.ring_state == 0 and hand.little_state == 0:
hand.gesture = "PEACE"
elif hand.thumb_state == 0 and hand.index_state == 1 and hand.middle_state == 0 and hand.ring_state == 0 and hand.little_state == 0:
hand.gesture = "ONE"
elif hand.thumb_state == 1 and hand.index_state == 1 and hand.middle_state == 0 and hand.ring_state == 0 and hand.little_state == 0:
hand.gesture = "TWO"
elif hand.thumb_state == 1 and hand.index_state == 1 and hand.middle_state == 1 and hand.ring_state == 0 and hand.little_state == 0:
hand.gesture = "THREE"
elif hand.thumb_state == 0 and hand.index_state == 1 and hand.middle_state == 1 and hand.ring_state == 1 and hand.little_state == 1:
hand.gesture = "FOUR"
else:
hand.gesture = None
# Movenet
class Body:
def __init__(self, scores=None, keypoints_norm=None, keypoints=None, score_thresh=None, crop_region=None,
next_crop_region=None):
"""
Attributes:
scores : scores of the keypoints
keypoints_norm : keypoints normalized ([0,1]) coordinates (x,y) in the squared cropped region
keypoints_square : keypoints coordinates (x,y) in pixels in the square padded image
keypoints : keypoints coordinates (x,y) in pixels in the source image (not padded)
score_thresh : score threshold used
crop_region : cropped region on which the current body was inferred
next_crop_region : cropping region calculated from the current body keypoints and that will be used on next frame
"""
self.scores = scores
self.keypoints_norm = keypoints_norm
self.keypoints = keypoints
self.score_thresh = score_thresh
self.crop_region = crop_region
self.next_crop_region = next_crop_region
# self.keypoints_square = (self.keypoints_norm * self.crop_region.size).astype(np.int)
self.keypoints = (np.array(
[self.crop_region.xmin, self.crop_region.ymin]) + self.keypoints_norm * self.crop_region.size).astype(
np.int)
def print(self):
attrs = vars(self)
print('\n'.join("%s: %s" % item for item in attrs.items()))
def distance_to_wrist(self, hand, wrist_handedness, pad_w=0, pad_h=0):
"""
Calculate the distance between a hand (class Hand) wrist position
and one of the body wrist given by wrist_handedness (= "left" or "right")
As the hand.landmarks cooordinates are expressed in the padded image, we must substract the padding (given by pad_w and pad_w)
to be coherent with the body keypoint coordinates which are expressed in the source image.
"""
return distance(hand.landmarks[0] - np.array([pad_w, pad_h]),
self.keypoints[BODY_KP[wrist_handedness + '_wrist']])
CropRegion = namedtuple('CropRegion', ['xmin', 'ymin', 'xmax', 'ymax',
'size']) # All values are in pixel. The region is a square of size 'size' pixels
# Dictionary that maps from joint names to keypoint indices.
BODY_KP = {
'nose': 0,
'left_eye': 1,
'right_eye': 2,
'left_ear': 3,
'right_ear': 4,
'left_shoulder': 5,
'right_shoulder': 6,
'left_elbow': 7,
'right_elbow': 8,
'left_wrist': 9,
'right_wrist': 10,
'left_hip': 11,
'right_hip': 12,
'left_knee': 13,
'right_knee': 14,
'left_ankle': 15,
'right_ankle': 16
}
class BodyPreFocusing:
"""
Body Pre Focusing with Movenet
Contains all is needed for :
- Movenet smart cropping (determines from the body detected in frame N,
the region of frame N+1 ow which the Movenet inference is run).
- Body Pre Focusing (determining from the Movenet wrist keypoints a smaller zone
on which Palm detection is run).
Both Smart cropping and Body Pre Focusing are important for model accuracy when
the body is far.
"""
def __init__(self, img_w, img_h, pad_w, pad_h, frame_size, mode="group", score_thresh=0.2, scale=1.0,
hands_up_only=True):
self.img_w = img_w
self.img_h = img_h
self.pad_w = pad_w
self.pad_h = pad_h
self.frame_size = frame_size
self.mode = mode
self.score_thresh = score_thresh
self.scale = scale
self.hands_up_only = hands_up_only
# Defines the default crop region (pads the full image from both sides to make it a square image)
# Used when the algorithm cannot reliably determine the crop region from the previous frame.
self.init_crop_region = CropRegion(-self.pad_w, -self.pad_h, -self.pad_w + self.frame_size,
-self.pad_h + self.frame_size, self.frame_size)
"""
Smart cropping stuff
"""
def crop_and_resize(self, frame, crop_region):
"""Crops and resize the image to prepare for the model input."""
cropped = frame[max(0, crop_region.ymin):min(self.img_h, crop_region.ymax),
max(0, crop_region.xmin):min(self.img_w, crop_region.xmax)]
if crop_region.xmin < 0 or crop_region.xmax >= self.img_w or crop_region.ymin < 0 or crop_region.ymax >= self.img_h:
# Padding is necessary
cropped = cv2.copyMakeBorder(cropped,
max(0, -crop_region.ymin),
max(0, crop_region.ymax - self.img_h),
max(0, -crop_region.xmin),
max(0, crop_region.xmax - self.img_w),
cv2.BORDER_CONSTANT)
cropped = cv2.resize(cropped, (self.pd_input_length, self.pd_input_length), interpolation=cv2.INTER_AREA)
return cropped
def torso_visible(self, scores):
"""Checks whether there are enough torso keypoints.
This function checks whether the model is confident at predicting one of the
shoulders/hips which is required to determine a good crop region.
"""
return ((scores[BODY_KP['left_hip']] > self.score_thresh or
scores[BODY_KP['right_hip']] > self.score_thresh) and
(scores[BODY_KP['left_shoulder']] > self.score_thresh or
scores[BODY_KP['right_shoulder']] > self.score_thresh))
def determine_torso_and_body_range(self, keypoints, scores, center_x, center_y):
"""Calculates the maximum distance from each keypoints to the center location.
The function returns the maximum distances from the two sets of keypoints:
full 17 keypoints and 4 torso keypoints. The returned information will be
used to determine the crop size. See determine_crop_region for more detail.
"""
torso_joints = ['left_shoulder', 'right_shoulder', 'left_hip', 'right_hip']
max_torso_yrange = 0.0
max_torso_xrange = 0.0
for joint in torso_joints:
dist_y = abs(center_y - keypoints[BODY_KP[joint]][1])
dist_x = abs(center_x - keypoints[BODY_KP[joint]][0])
if dist_y > max_torso_yrange:
max_torso_yrange = dist_y
if dist_x > max_torso_xrange:
max_torso_xrange = dist_x
max_body_yrange = 0.0
max_body_xrange = 0.0
for i in range(len(BODY_KP)):
if scores[i] < self.score_thresh:
continue
dist_y = abs(center_y - keypoints[i][1])
dist_x = abs(center_x - keypoints[i][0])
if dist_y > max_body_yrange:
max_body_yrange = dist_y
if dist_x > max_body_xrange:
max_body_xrange = dist_x
return [max_torso_yrange, max_torso_xrange, max_body_yrange, max_body_xrange]
def determine_crop_region(self, body):
"""Determines the region to crop the image for the model to run inference on.
The algorithm uses the detected joints from the previous frame to estimate
the square region that encloses the full body of the target person and
centers at the midpoint of two hip joints. The crop size is determined by
the distances between each joints and the center point.
When the model is not confident with the four torso joint predictions, the
function returns a default crop which is the full image padded to square.
"""
if self.torso_visible(body.scores):
center_x = (body.keypoints[BODY_KP['left_hip']][0] + body.keypoints[BODY_KP['right_hip']][0]) // 2
center_y = (body.keypoints[BODY_KP['left_hip']][1] + body.keypoints[BODY_KP['right_hip']][1]) // 2
max_torso_yrange, max_torso_xrange, max_body_yrange, max_body_xrange = self.determine_torso_and_body_range(
body.keypoints, body.scores, center_x, center_y)
crop_length_half = np.amax(
[max_torso_xrange * 1.9, max_torso_yrange * 1.9, max_body_yrange * 1.2, max_body_xrange * 1.2])
tmp = np.array([center_x, self.img_w - center_x, center_y, self.img_h - center_y])
crop_length_half = int(round(np.amin([crop_length_half, np.amax(tmp)])))
crop_corner = [center_x - crop_length_half, center_y - crop_length_half]
if crop_length_half > max(self.img_w, self.img_h) / 2:
return self.init_crop_region
else:
crop_length = crop_length_half * 2
return CropRegion(crop_corner[0], crop_corner[1], crop_corner[0] + crop_length,
crop_corner[1] + crop_length, crop_length)
else:
return self.init_crop_region
"""
Body Pre Focusing stuff
"""
def torso_visible(self, scores):
"""Checks whether there are enough torso keypoints.
This function checks whether the model is confident at predicting one of the
shoulders/hips which is required to determine a good crop region.
"""
return ((scores[BODY_KP['left_hip']] > self.score_thresh or
scores[BODY_KP['right_hip']] > self.score_thresh) and
(scores[BODY_KP['left_shoulder']] > self.score_thresh or
scores[BODY_KP['right_shoulder']] > self.score_thresh))
def determine_torso_and_body_range(self, keypoints, scores, center_x, center_y):
"""Calculates the maximum distance from each keypoints to the center location.
The function returns the maximum distances from the two sets of keypoints:
full 17 keypoints and 4 torso keypoints. The returned information will be
used to determine the crop size. See determine_crop_region for more detail.
"""
torso_joints = ['left_shoulder', 'right_shoulder', 'left_hip', 'right_hip']
max_torso_yrange = 0.0
max_torso_xrange = 0.0
for joint in torso_joints:
dist_y = abs(center_y - keypoints[BODY_KP[joint]][1])
dist_x = abs(center_x - keypoints[BODY_KP[joint]][0])
if dist_y > max_torso_yrange:
max_torso_yrange = dist_y
if dist_x > max_torso_xrange:
max_torso_xrange = dist_x
max_body_yrange = 0.0
max_body_xrange = 0.0
for i in range(len(BODY_KP)):
if scores[i] < self.score_thresh:
continue
dist_y = abs(center_y - keypoints[i][1])
dist_x = abs(center_x - keypoints[i][0])
if dist_y > max_body_yrange:
max_body_yrange = dist_y
if dist_x > max_body_xrange:
max_body_xrange = dist_x
return [max_torso_yrange, max_torso_xrange, max_body_yrange, max_body_xrange]
def determine_crop_region(self, body):
"""Determines the region to crop the image for the model to run inference on.
The algorithm uses the detected joints from the previous frame to estimate
the square region that encloses the full body of the target person and
centers at the midpoint of two hip joints. The crop size is determined by
the distances between each joints and the center point.
When the model is not confident with the four torso joint predictions, the
function returns a default crop which is the full image padded to square.
"""
if self.torso_visible(body.scores):
center_x = (body.keypoints[BODY_KP['left_hip']][0] + body.keypoints[BODY_KP['right_hip']][0]) // 2
center_y = (body.keypoints[BODY_KP['left_hip']][1] + body.keypoints[BODY_KP['right_hip']][1]) // 2
max_torso_yrange, max_torso_xrange, max_body_yrange, max_body_xrange = self.determine_torso_and_body_range(
body.keypoints, body.scores, center_x, center_y)
crop_length_half = np.amax(
[max_torso_xrange * 1.9, max_torso_yrange * 1.9, max_body_yrange * 1.2, max_body_xrange * 1.2])
tmp = np.array([center_x, self.img_w - center_x, center_y, self.img_h - center_y])
crop_length_half = int(round(np.amin([crop_length_half, np.amax(tmp)])))
crop_corner = [center_x - crop_length_half, center_y - crop_length_half]
if crop_length_half > max(self.img_w, self.img_h) / 2:
return self.init_crop_region
else:
crop_length = crop_length_half * 2
return CropRegion(crop_corner[0], crop_corner[1], crop_corner[0] + crop_length,
crop_corner[1] + crop_length, crop_length)
else:
return self.init_crop_region
def estimate_focus_zone_size(self, body):
"""
This function is called if at least the segment "wrist_elbow" is visible.
We calculate the length of every segment from a predefined list. A segment length
is the distance between the 2 endpoints weighted by a coefficient. The weight have been chosen
so that the length of all segments are roughly equal. We take the maximal length to estimate
the size of the focus zone.
If no segment are vissible, we consider the body is very close
to the camera, and therefore there is no need to focus. Return 0
To not have at least one shoulder and one hip visible means the body is also very close
and the estimated size needs to be adjusted (bigger)
"""
segments = [
("left_shoulder", "left_elbow", 2.3),
("left_elbow", "left_wrist", 2.3),
("left_shoulder", "left_hip", 1),
("left_shoulder", "right_shoulder", 1.5),
("right_shoulder", "right_elbow", 2.3),
("right_elbow", "right_wrist", 2.3),
("right_shoulder", "right_hip", 1),
]
lengths = []
for s in segments:
if body.scores[BODY_KP[s[0]]] > self.score_thresh and body.scores[BODY_KP[s[1]]] > self.score_thresh:
l = np.linalg.norm(body.keypoints[BODY_KP[s[0]]] - body.keypoints[BODY_KP[s[1]]])
lengths.append(l)
if lengths:
if (body.scores[BODY_KP["left_hip"]] < self.score_thresh and
body.scores[BODY_KP["right_hip"]] < self.score_thresh or
body.scores[BODY_KP["left_shoulder"]] < self.score_thresh and
body.scores[BODY_KP["right_shoulder"]] < self.score_thresh):
coef = 1.5
else:
coef = 1.0
return 2 * int(coef * self.scale * max(lengths) / 2) # The size is made even
else:
return 0
def get_focus_zone(self, body):
"""
Return a tuple (focus_zone, label)
'body' = instance of class Body
'focus_zone' is a zone around a hand or hands, depending on the value
of self.mode ("left", "right", "higher" or "group") and on the value of self.hands_up_only.
- self.mode = "left" (resp "right"): we are looking for the zone around the left (resp right) wrist,
- self.mode = "group": the zone encompasses both wrists,
- self.mode = "higher": the zone is around the higher wrist (smaller y value),
- self.hands_up_only = True: we don't take into consideration the wrist if the corresponding elbow is above the wrist,
focus_zone is a list [left, top, right, bottom] defining the top-left and right-bottom corners of a square.
Values are expressed in pixels in the source image C.S.
The zone is constrained to the squared source image (= source image with padding if necessary).
It means that values can be negative.
left and right in [-pad_w, img_w + pad_w]
top and bottom in [-pad_h, img_h + pad_h]
'label' describes which wrist keypoint(s) were used to build the zone : "left", "right" or "group" (if built from both wrists)
If the wrist keypoint(s) is(are) not present or is(are) present but self.hands_up_only = True and
wrist(s) is(are) below corresponding elbow(s), then focus_zone = None.
"""
def zone_from_center_size(x, y, size):
"""
Return zone [left, top, right, bottom]
from zone center (x,y) and zone size (the zone is square).
"""
half_size = size // 2
size = half_size * 2
if size > self.img_w:
x = self.img_w // 2
x1 = x - half_size
if x1 < -self.pad_w:
x1 = -self.pad_w
elif x1 + size > self.img_w + self.pad_w:
x1 = self.img_w + self.pad_w - size
x2 = x1 + size
if size > self.img_h:
y = self.img_h // 2
y1 = y - half_size
if y1 < -self.pad_h:
y1 = -self.pad_h
elif y1 + size > self.img_h + self.pad_h:
y1 = self.img_h + self.pad_h - size
y2 = y1 + size
return [x1, y1, x2, y2]
def get_one_hand_zone(hand_label):
"""
Return the zone [left, top, right, bottom] around the hand given by its label "hand_label" ("left" or "right")
Values are expressed in pixels in the source image C.S.
If the wrist keypoint is not visible, return None.
If self.hands_up_only is True, return None if wrist keypoint is below elbow keypoint.
"""
wrist_kp = hand_label + "_wrist"
wrist_score = body.scores[BODY_KP[wrist_kp]]
if wrist_score < self.score_thresh:
return None
x, y = body.keypoints[BODY_KP[wrist_kp]]
if self.hands_up_only:
# We want to detect only hands where the wrist is above the elbow (when visible)
elbow_kp = hand_label + "_elbow"
if body.scores[BODY_KP[elbow_kp]] > self.score_thresh and \
body.keypoints[BODY_KP[elbow_kp]][1] < body.keypoints[BODY_KP[wrist_kp]][1]:
return None
# Let's evaluate the size of the focus zone
size = self.estimate_focus_zone_size(body)
if size == 0: return [-self.pad_w, -self.pad_h, self.frame_size - self.pad_w,
self.frame_size - self.pad_h] # The hand is too close. No need to focus
return zone_from_center_size(x, y, size)
if self.mode == "group":
zonel = get_one_hand_zone("left")
if zonel:
zoner = get_one_hand_zone("right")
if zoner:
xl1, yl1, xl2, yl2 = zonel
xr1, yr1, xr2, yr2 = zoner
x1 = min(xl1, xr1)
y1 = min(yl1, yr1)
x2 = max(xl2, xr2)
y2 = max(yl2, yr2)
# Global zone center (x,y)
x = int((x1 + x2) / 2)
y = int((y1 + y2) / 2)
size_x = x2 - x1
size_y = y2 - y1
size = 2 * (max(size_x, size_y) // 2)
return (zone_from_center_size(x, y, size), "group")
else:
return (zonel, "left")
else:
return (get_one_hand_zone("right"), "right")
elif self.mode == "higher":