@@ -31,30 +31,38 @@ def _join_arrays(row1, col1, data1,
31
31
#pylint: disable=too-many-arguments
32
32
#pylint: disable=too-many-locals
33
33
34
+ idx1 = np .lexsort ((col1 , row1 ))
35
+ idx2 = np .lexsort ((col2 , row2 ))
34
36
# join types
35
37
if join_type == "left" :
36
- idx_inner_left , idx_inner_right = get_idx (row1 , col1 , row2 , col2 , join_type = "inner" )
38
+ idx_inner_left , idx_inner_right , _ , _ , _ , _ = get_idx (row1 , col1 , row2 , col2 ,
39
+ idx1 , idx2 , join_type = "inner" )
37
40
data_join = set_and_fill_new_array (data1 , data2 , name ,
38
41
np .arange (0 , len (row1 )), np .arange (0 , len (row1 )),
39
42
idx_inner_right , idx_inner_left ,
40
43
len (row1 ))
41
44
return row1 , col1 , data_join
42
45
if join_type == "right" :
43
- idx_inner_left , idx_inner_right = get_idx (row1 , col1 , row2 , col2 , join_type = "inner" )
46
+ idx_inner_left , idx_inner_right , _ , _ , _ , _ = get_idx (row1 , col1 , row2 , col2 ,
47
+ idx1 , idx2 , join_type = "inner" )
44
48
data_join = set_and_fill_new_array (data1 , data2 , name ,
45
49
idx_inner_left , idx_inner_right ,
46
50
np .arange (0 , len (row2 )), np .arange (0 , len (row2 )),
47
51
len (row2 ))
48
52
return row2 , col2 , data_join
49
53
if join_type == "inner" :
50
- idx_inner_left , idx_inner_right = get_idx (row1 , col1 , row2 , col2 , join_type = "inner" )
54
+ idx_inner_left , idx_inner_right , _ , _ , _ , _ = get_idx (row1 , col1 , row2 , col2 ,
55
+ idx1 , idx2 , join_type = "inner" )
51
56
data_join = set_and_fill_new_array (data1 , data2 , name ,
52
57
idx_inner_left , np .arange (0 , len (idx_inner_left )),
53
58
idx_inner_right , np .arange (0 , len (idx_inner_left )),
54
59
len (idx_inner_left ))
55
60
return row1 [idx_inner_left ], col1 [idx_inner_left ], data_join
56
61
if join_type == "outer" :
57
- idx_left , idx_left_new , idx_right , idx_right_new , row_new , col_new = get_idx_outer (row1 , col1 , row2 , col2 )
62
+ idx_left , idx_right , idx_left_new , idx_right_new , row_new , col_new = get_idx_outer (
63
+ row1 , col1 , row2 , col2 ,
64
+ idx1 , idx2
65
+ )
58
66
data_join = set_and_fill_new_array (data1 , data2 , name ,
59
67
idx_left , idx_left_new , idx_right , idx_right_new ,
60
68
len (row_new ))
@@ -68,6 +76,7 @@ def set_and_fill_new_array(data1, data2, name,
68
76
"""Create new structured numpy array and fill with data1 and data2.
69
77
"""
70
78
#pylint: disable=too-many-arguments
79
+
71
80
new_dtype = [(dname , d [0 ]) for dname , d in data1 .dtype .fields .items ()]
72
81
if data2 .dtype .names is None :
73
82
new_dtype += [(name , data2 .dtype )]
@@ -92,69 +101,104 @@ def set_and_fill_new_array(data1, data2, name,
92
101
93
102
94
103
@numba .jit (nopython = True )
95
- def get_idx_inner_brute_force (left_row , left_col , right_row , right_col ):
96
- #Get indexes for entries for a inner join.
97
- idx_inner_left = []
98
- idx_inner_right = []
99
- for i , right_row_id in enumerate (right_row ):
100
- if right_row_id in left_row :
101
- idx = np .where ((left_row == right_row_id )
102
- & (left_col == right_col [i ]))[0 ]
103
- if len (idx ) > 0 :
104
- idx_inner_left .append (idx [0 ])
105
- idx_inner_right .append (i )
106
- return idx_inner_left , idx_inner_right
104
def get_idx_inner(left_row, left_col, right_row, right_col,
                  idx1, idx2):
    """Get current and new indices for inner merge.

    Walks both inputs in lexsorted order and records every pair of entries
    whose (row, col) coordinates are equal.

    Parameters
    ----------
    left_row, left_col
        Row and column coordinates of the left entries.
    right_row, right_col
        Row and column coordinates of the right entries.
    idx1, idx2
        Numpy array of pre-sorted (np.lexsort) indices for left/right arrays.

    Returns
    -------
    idx_left, idx_right
        Indices into the left/right arrays of each matching pair.
    idx_left_new, idx_right_new
        Position of each matching pair in the joined output (both lists hold
        the same running counter values).
    row_new, col_new
        Row and column coordinates of each matching pair.
    """
    #pylint: disable=too-many-arguments
    #pylint: disable=too-many-locals
    idx_left = []
    idx_left_new = []
    idx_right = []
    idx_right_new = []
    row_new = []
    col_new = []

    n_right = len(idx2)
    # `low` is a POSITION inside idx2 (not an index into the right arrays):
    # everything before it has a row smaller than the current left row and
    # can therefore never match this or any later left entry.
    low = 0
    counter = 0
    for i in idx1:
        row_i = left_row[i]
        col_i = left_col[i]
        for pos in range(low, n_right):
            j = idx2[pos]
            row_j = right_row[j]
            if row_j == row_i and right_col[j] == col_i:
                idx_left.append(i)
                idx_left_new.append(counter)
                idx_right.append(j)
                idx_right_new.append(counter)
                row_new.append(row_i)
                col_new.append(col_i)
                counter += 1
            if row_i > row_j:
                # Right entry lies in an earlier row; skip it from now on.
                low = pos + 1
            if row_i < row_j:
                # Right entries are sorted by row, nothing further can match.
                break
    return idx_left, idx_right, idx_left_new, idx_right_new, row_new, col_new
135
137
136
138
137
139
@numba .jit (nopython = True )
138
- def get_idx_outer (left_row , left_col , right_row , right_col ):
140
def get_idx_outer(left_row, left_col, right_row, right_col,
                  idx1, idx2):
    """Get current and new indices for outer merge.

    Every left entry is emitted first, in lexsorted order, paired with a
    right entry when one with the same (row, col) exists. Right entries
    that matched no left entry are appended afterwards, also in lexsorted
    order.

    Parameters
    ----------
    left_row, left_col
        Row and column coordinates of the left entries.
    right_row, right_col
        Row and column coordinates of the right entries.
    idx1, idx2
        Numpy array of pre-sorted (np.lexsort) indices for left/right arrays.

    Returns
    -------
    idx_left, idx_right
        Indices into the left/right arrays of the entries that appear in the
        joined output.
    idx_left_new, idx_right_new
        Output positions for the entries listed in idx_left / idx_right.
    row_new, col_new
        Row and column coordinates of every output entry.
    """
    #pylint: disable=too-many-arguments
    #pylint: disable=too-many-locals
    idx_left = []
    idx_left_new = []
    idx_right = []
    idx_right_new = []
    row_new = []
    col_new = []

    n_right = len(idx2)
    # Marks right entries that matched at least one left entry.
    matched = np.zeros(len(right_row), dtype=np.bool_)
    # `low` is a POSITION inside idx2 (not an index into the right arrays):
    # everything before it has a row smaller than the current left row and
    # can therefore never match this or any later left entry.
    low = 0
    counter = 0
    for i in idx1:
        row_i = left_row[i]
        col_i = left_col[i]
        current_match = False
        last_match = -1
        for pos in range(low, n_right):
            j = idx2[pos]
            row_j = right_row[j]
            if row_j == row_i and right_col[j] == col_i:
                matched[j] = True
                # When several right entries share the key, the last one in
                # sorted order is the one paired with this left entry.
                last_match = j
                current_match = True
            if row_i > row_j:
                low = pos + 1
            if row_i < row_j:
                break
        idx_left.append(i)
        idx_left_new.append(counter)
        if current_match:
            idx_right.append(last_match)
            idx_right_new.append(counter)
        row_new.append(row_i)
        col_new.append(col_i)
        counter += 1

    # Append right-only entries in lexsorted order so the output order is
    # deterministic (set iteration order is not).
    for j in idx2:
        if not matched[j]:
            idx_right.append(j)
            idx_right_new.append(counter)
            row_new.append(right_row[j])
            col_new.append(right_col[j])
            counter += 1
    return idx_left, idx_right, idx_left_new, idx_right_new, row_new, col_new
193
+
194
+
195
def get_idx(left_row, left_col, right_row, right_col, idx1, idx2,
            join_type="left"):
    """Dispatch to the index builder for the requested join type.

    Only "inner" and "outer" are handled here; any other value, including
    the default "left", raises ValueError.

    idx1, idx2
        Pre-sorted (np.lexsort) indices for the left/right arrays, passed
        through unchanged to the selected builder.
    """
    #pylint: disable=too-many-arguments
    if join_type not in ("inner", "outer"):
        raise ValueError("Unknown join_type")
    if join_type == "inner":
        builder = get_idx_inner
    else:
        builder = get_idx_outer
    return builder(left_row, left_col, right_row, right_col, idx1, idx2)
0 commit comments