diff --git a/pyensembl/__init__.py b/pyensembl/__init__.py index 71a1ec5..c019683 100644 --- a/pyensembl/__init__.py +++ b/pyensembl/__init__.py @@ -17,7 +17,7 @@ from .memory_cache import MemoryCache from .download_cache import DownloadCache from .ensembl_release import EnsemblRelease -from .ensembl_release_versions import check_release_number, MAX_ENSEMBL_RELEASE +from .ensembl_release_versions import MAX_ENSEMBL_RELEASE from .exon import Exon from .genome import Genome from .gene import Gene @@ -35,28 +35,23 @@ ) from .transcript import Transcript -__version__ = '0.9.6' +__version__ = '0.9.7' -_cache = {} +def cached_release(release, species="human"): + """ + Create an EnsemblRelease instance only if it hasn't already been made, + otherwise return the old instance. -def cached_release(version, species="human"): - """Cached construction of EnsemblRelease objects. It's desirable to reuse - the same EnsemblRelease object since each one will store a lot of cached - annotation data in-memory. + Keeping this function for backwards compatibility but this functionality + has been moved into the cached method of EnsemblRelease. 
""" - version = check_release_number(version) - species = check_species_object(species) - key = (version, species) - if key not in _cache: - ensembl = EnsemblRelease(version, species=species) - _cache[key] = ensembl - return _cache[key] + return EnsemblRelease.cached(release=release, species=species) def genome_for_reference_name(reference_name): reference_name = normalize_reference_name(reference_name) species = find_species_by_reference(reference_name) (_, max_ensembl_release) = species.reference_assemblies[reference_name] - return cached_release(max_ensembl_release, species=species) + return cached_release(release=max_ensembl_release, species=species) ensembl_grch36 = ensembl54 = cached_release(54) # last release for GRCh36/hg18 ensembl_grch37 = ensembl75 = cached_release(75) # last release for GRCh37/hg19 diff --git a/pyensembl/ensembl_release.py b/pyensembl/ensembl_release.py index c6fe5f9..20a8967 100644 --- a/pyensembl/ensembl_release.py +++ b/pyensembl/ensembl_release.py @@ -16,6 +16,7 @@ Contains the EnsemblRelease class, which extends the Genome class to be specific to (a particular release of) Ensembl. """ +from weakref import WeakValueDictionary from .genome import Genome from .ensembl_release_versions import check_release_number, MAX_ENSEMBL_RELEASE @@ -32,18 +33,53 @@ class EnsemblRelease(Genome): Bundles together the genomic annotation and sequence data associated with a particular release of the Ensembl database. """ - def __init__(self, - release=MAX_ENSEMBL_RELEASE, - species=human, - server=ENSEMBL_FTP_SERVER): - self.release = check_release_number(release) - self.species = check_species_object(species) - self.server = server + + @classmethod + def normalize_init_values(cls, release, species, server): + """ + Normalizes the arguments which uniquely specify an EnsemblRelease + genome. 
+ """ + release = check_release_number(release) + species = check_species_object(species) + return (release, species, server) + + # Using a WeakValueDictionary instead of an ordinary dict to prevent a + # memory leak in cases where we test many different releases in sequence. + # When all the references to a particular EnsemblRelease die then that + # genome should also be removed from this cache. + _genome_cache = WeakValueDictionary() + + @classmethod + def cached( + cls, + release=MAX_ENSEMBL_RELEASE, + species=human, + server=ENSEMBL_FTP_SERVER): + """ + Construct EnsemblRelease if it's never been made before, otherwise + return an old instance. + """ + init_args_tuple = cls.normalize_init_values(release, species, server) + if init_args_tuple in cls._genome_cache: + genome = cls._genome_cache[init_args_tuple] + else: + genome = cls._genome_cache[init_args_tuple] = cls(*init_args_tuple) + return genome + + def __init__( + self, + release=MAX_ENSEMBL_RELEASE, + species=human, + server=ENSEMBL_FTP_SERVER): + self.release, self.species, self.server = self.normalize_init_values( + release=release, species=species, server=server) self.gtf_url = make_gtf_url( ensembl_release=self.release, - species=species, - server=server) + species=self.species, + server=self.server) + self.transcript_fasta_url = make_fasta_url( ensembl_release=self.release, species=self.species.latin_name, @@ -53,7 +89,7 @@ def __init__(self, ensembl_release=self.release, species=self.species.latin_name, sequence_type="pep", - server=server) + server=self.server) self.reference_name = self.species.which_reference(self.release) @@ -92,3 +128,10 @@ def to_dict(self): "species": self.species, "server": self.server } + + @classmethod + def from_dict(cls, state_dict): + """ + Deserialize EnsemblRelease without creating duplicate instances. 
+ """ + return cls.cached(**state_dict) diff --git a/test/test_serialization.py b/test/test_serialization.py index 8251fc8..a2bd13d 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -25,7 +25,6 @@ setup_init_custom_mouse_genome ) - @test_ensembl_releases() def test_pickle_ensembl_gene(ensembl_genome): gene = ensembl_genome.gene_by_id(TP53_gene_id) @@ -112,3 +111,10 @@ def test_species_to_json(): def test_species_to_pickle(): eq_(human, pickle.loads(pickle.dumps(human))) + + +@test_ensembl_releases() +def test_unique_memory_address_of_unpickled_genomes(ensembl_genome): + unpickled = pickle.loads(pickle.dumps(ensembl_genome)) + assert ensembl_genome is unpickled, \ + "Expected same object for %s but got two different instances" % (unpickled,)