Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[QUESTION] CHAPTER 2: Custom Transformer Issue with Joblib #134

Open
AdrianVazquezTorres opened this issue May 30, 2024 · 0 comments
Open

Comments

@AdrianVazquezTorres
Copy link

Hi there, I've been dealing with a problem for the last few days with the custom transformer ClusterSimilarity (notebook: 02_end_to_end_machine_learning_project | cell: 96). It turns out that the custom transformer works correctly when doing the calculations, but when I try to save it using joblib it gives me the following error:

'NoneType' object is not iterable

Whether I try to save it within a pipeline or separately, it still gives me the same error. I realized that the problem is in the transform function; if I remove it, I can save the custom transformer without problems.

IMPORTANT: This problem appears with all the custom transformers that I try to save NOT only with this one.

My code is exactly the same as the book:

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Transformer that maps each sample to its RBF similarity with k-means centers.

    Fits a KMeans model on the training data and, at transform time, emits one
    feature per cluster: exp(-gamma * ||x - center||^2).
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        # sklearn convention: __init__ stores hyperparameters verbatim, no validation.
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Cluster X with KMeans and keep the fitted model as ``kmeans_``.

        ``y`` is ignored (present for pipeline compatibility); ``sample_weight``
        is forwarded to ``KMeans.fit``. Returns ``self`` for chaining.
        """
        self.kmeans_ = KMeans(n_clusters=self.n_clusters, n_init=10,
                              random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Return an (n_samples, n_clusters) array of RBF similarities."""
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Output feature names; ``names`` is accepted but unused (sklearn API)."""
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]

# Instantiate the transformer with the book's hyperparameters;
# random_state=42 makes the KMeans initialization reproducible.
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1, random_state=42)

Here is the complete error:

TypeError                                 Traceback (most recent call last)
Cell In[43], line 1
----> 1 joblib.dump(cluster_simil, "test.pkl")

File ~\anaconda3\envs\data_science_py310\lib\site-packages\joblib\numpy_pickle.py:553, in dump(value, filename, compress, protocol, cache_size)
    551 elif is_filename:
    552     with open(filename, 'wb') as f:
--> 553         NumpyPickler(f, protocol=protocol).dump(value)
    554 else:
    555     NumpyPickler(filename, protocol=protocol).dump(value)

File ~\anaconda3\envs\data_science_py310\lib\pickle.py:487, in _Pickler.dump(self, obj)
    485 if self.proto >= 4:
    486     self.framer.start_framing()
--> 487 self.save(obj)
    488 self.write(STOP)
    489 self.framer.end_framing()

File ~\anaconda3\envs\data_science_py310\lib\site-packages\joblib\numpy_pickle.py:355, in NumpyPickler.save(self, obj)
    352     wrapper.write_array(obj, self)
    353     return
--> 355 return Pickler.save(self, obj)

File ~\anaconda3\envs\data_science_py310\lib\pickle.py:603, in _Pickler.save(self, obj, save_persistent_id)
    599     raise PicklingError("Tuple returned by %s must have "
    600                         "two to six elements" % reduce)
    602 # Save the reduce() output and finally memoize the object
--> 603 self.save_reduce(obj=obj, *rv)

File ~\anaconda3\envs\data_science_py310\lib\pickle.py:687, in _Pickler.save_reduce(self, func, args, state, listitems, dictitems, state_setter, obj)
    684     raise PicklingError(
    685         "args[0] from __newobj__ args has the wrong class")
    686 args = args[1:]
--> 687 save(cls)
    688 save(args)
    689 write(NEWOBJ)

File ~\anaconda3\envs\data_science_py310\lib\site-packages\joblib\numpy_pickle.py:355, in NumpyPickler.save(self, obj)
    352     wrapper.write_array(obj, self)
    353     return
--> 355 return Pickler.save(self, obj)

File ~\anaconda3\envs\data_science_py310\lib\pickle.py:560, in _Pickler.save(self, obj, save_persistent_id)
    558 f = self.dispatch.get(t)
    559 if f is not None:
--> 560     f(self, obj)  # Call unbound method with explicit self
    561     return
    563 # Check private dispatch table if any, or else
    564 # copyreg.dispatch_table

File ~\anaconda3\envs\data_science_py310\lib\site-packages\dill\_dill.py:1832, in save_type(pickler, obj, postproc_list)
   1829     postproc_list.append((setattr, (obj, '__qualname__', qualname)))
   1831 if not hasattr(obj, '__orig_bases__'):
-> 1832     _save_with_postproc(pickler, (_create_type, (
   1833         type(obj), obj.__name__, obj.__bases__, _dict
   1834     )), obj=obj, postproc_list=postproc_list)
   1835 else:
   1836     # This case will always work, but might be overkill.
   1837     _metadict = {
   1838         'metaclass': type(obj)
   1839     }

File ~\anaconda3\envs\data_science_py310\lib\site-packages\dill\_dill.py:1098, in _save_with_postproc(pickler, reduction, is_pickler_dill, obj, postproc_list)
   1095     pickler._postproc[id(obj)] = postproc_list
   1097 # TODO: Use state_setter in Python 3.8 to allow for faster cPickle implementations
-> 1098 pickler.save_reduce(*reduction, obj=obj)
   1100 if is_pickler_dill:
   1101     # pickler.x -= 1
   1102     # print(pickler.x*' ', 'pop', obj, id(obj))
   1103     postproc = pickler._postproc.pop(id(obj))

File ~\anaconda3\envs\data_science_py310\lib\pickle.py:692, in _Pickler.save_reduce(self, func, args, state, listitems, dictitems, state_setter, obj)
    690 else:
    691     save(func)
--> 692     save(args)
    693     write(REDUCE)
    695 if obj is not None:
    696     # If the object is already in the memo, this means it is
    697     # recursive. In this case, throw away everything we put on the
    698     # stack, and fetch the object back from the memo.

File ~\anaconda3\envs\data_science_py310\lib\site-packages\joblib\numpy_pickle.py:355, in NumpyPickler.save(self, obj)
    352     wrapper.write_array(obj, self)
    353     return
--> 355 return Pickler.save(self, obj)

File ~\anaconda3\envs\data_science_py310\lib\pickle.py:560, in _Pickler.save(self, obj, save_persistent_id)
    558 f = self.dispatch.get(t)
    559 if f is not None:
--> 560     f(self, obj)  # Call unbound method with explicit self
    561     return
    563 # Check private dispatch table if any, or else
    564 # copyreg.dispatch_table

File ~\anaconda3\envs\data_science_py310\lib\pickle.py:902, in _Pickler.save_tuple(self, obj)
    900 write(MARK)
    901 for element in obj:
--> 902     save(element)
    904 if id(obj) in memo:
    905     # Subtle.  d was not in memo when we entered save_tuple(), so
    906     # the process of saving the tuple's elements must have saved
   (...)
    910     # could have been done in the "for element" loop instead, but
    911     # recursive tuples are a rare thing.
    912     get = self.get(memo[id(obj)][0])

File ~\anaconda3\envs\data_science_py310\lib\site-packages\joblib\numpy_pickle.py:355, in NumpyPickler.save(self, obj)
    352     wrapper.write_array(obj, self)
    353     return
--> 355 return Pickler.save(self, obj)

File ~\anaconda3\envs\data_science_py310\lib\pickle.py:560, in _Pickler.save(self, obj, save_persistent_id)
    558 f = self.dispatch.get(t)
    559 if f is not None:
--> 560     f(self, obj)  # Call unbound method with explicit self
    561     return
    563 # Check private dispatch table if any, or else
    564 # copyreg.dispatch_table

File ~\anaconda3\envs\data_science_py310\lib\site-packages\dill\_dill.py:1217, in save_module_dict(pickler, obj)
   1214     if is_dill(pickler, child=False) and pickler._session:
   1215         # we only care about session the first pass thru
   1216         pickler._first_pass = False
-> 1217     StockPickler.save_dict(pickler, obj)
   1218     logger.trace(pickler, "# D2")
   1219 return

File ~\anaconda3\envs\data_science_py310\lib\pickle.py:972, in _Pickler.save_dict(self, obj)
    969     self.write(MARK + DICT)
    971 self.memoize(obj)
--> 972 self._batch_setitems(obj.items())

File ~\anaconda3\envs\data_science_py310\lib\pickle.py:998, in _Pickler._batch_setitems(self, items)
    996     for k, v in tmp:
    997         save(k)
--> 998         save(v)
    999     write(SETITEMS)
   1000 elif n:

File ~\anaconda3\envs\data_science_py310\lib\site-packages\joblib\numpy_pickle.py:355, in NumpyPickler.save(self, obj)
    352     wrapper.write_array(obj, self)
    353     return
--> 355 return Pickler.save(self, obj)

File ~\anaconda3\envs\data_science_py310\lib\pickle.py:560, in _Pickler.save(self, obj, save_persistent_id)
    558 f = self.dispatch.get(t)
    559 if f is not None:
--> 560     f(self, obj)  # Call unbound method with explicit self
    561     return
    563 # Check private dispatch table if any, or else
    564 # copyreg.dispatch_table

File ~\anaconda3\envs\data_science_py310\lib\site-packages\dill\_dill.py:1960, in save_function(pickler, obj)
   1955 if globs_copy is not None and globs is not globs_copy:
   1956     # In the case that the globals are copied, we need to ensure that
   1957     # the globals dictionary is updated when all objects in the
   1958     # dictionary are already created.
   1959     glob_ids = {id(g) for g in globs_copy.values()}
-> 1960     for stack_element in _postproc:
   1961         if stack_element in glob_ids:
   1962             _postproc[stack_element].append((_setitems, (globs, globs_copy)))

TypeError: 'NoneType' object is not iterable

Versions:

  • OS: Microsoft Windows 11 Home Single Language (10.0.22631)
  • Python: 3.10.14
  • TensorFlow: 2.10.1 (CUDA)
  • Scikit-Learn: 1.4.2
  • Joblib: 1.4.0

Thank you for reading.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant