|
5 | 5 | import pandas as pd
|
6 | 6 | import pooch
|
7 | 7 |
|
| 8 | +from importlib.metadata import version as installed_version |
8 | 9 | from importlib.resources import files
|
9 | 10 |
|
10 |
| - |
# Names exported by ``from <module> import *``; each one is defined in this module.
__all__ = [
    "available_datasets",
    "fetch",
    "read_dreams",
    "read_info",
]
|
18 | 17 |
|
19 | 18 |
|
# Module-level pooch repository used by every function in this module.
# pooch substitutes ``{version}`` in ``base_url`` with ``version`` below, or with
# ``version_dev`` when the installed version is a development version.
repository = pooch.create(
    base_url="https://github.com/dxelab/dreambank/raw/{version}/data/",
    path=pooch.os_cache("dreambank"),
    # Single quotes inside the replacement field: reusing the outer double
    # quotes is a SyntaxError on Python versions before 3.12 (PEP 701).
    version=f"v{installed_version('dreambank')}",
    version_dev="dev",
)
# The registry is loaded from the ``registry.txt`` file in ``dreambank.data``.
repository.load_registry(files("dreambank.data").joinpath("registry.txt"))
20 | 26 |
|
21 |
| -registry_hashes = { |
22 |
| - "main": "sha256:c2307b5f93ec13883b472d4764488b2386acf604aca7b78ad760656898247bcb", |
23 |
| -} |
24 |
| - |
25 |
| -def get_registry_filepath(version_str): |
26 |
| - version_str = pooch.check_version(version_str, fallback="main") |
27 |
| - url = f"https://github.com/dxelab/dreambank/raw/{version_str}/registry.txt" |
28 |
| - known_hash = registry_hashes[version_str] |
29 |
| - fp = pooch.retrieve(url, known_hash=known_hash, path=pooch.os_cache("dreambank")) |
30 |
| - return fp |
31 | 27 |
|
32 |
| -def create_pup(version): |
33 |
| - # Offers version control. |
34 |
| - version_str = f"v{version}" |
35 |
| - pup = pooch.create( |
36 |
| - path=pooch.os_cache("dreambank"), |
37 |
| - base_url="https://github.com/dxelab/dreambank/raw/{version}/data/", |
38 |
| - version=version_str, |
39 |
| - version_dev="main", |
40 |
| - ) |
41 |
| - registry_filepath = get_registry_filepath(version_str) |
42 |
| - pup.load_registry(registry_filepath) |
43 |
| - return pup |
44 | 28 |
|
def available_datasets():
    """List every unique dataset ID found in the `dreambank` registry.

    Returns
    -------
    dataset_ids : list
        Unique dataset IDs as strings, in sorted order.

    Examples
    --------
    >>> import dreambank
    >>> dataset_ids = dreambank.available_datasets()
    >>> print(dataset_ids[:5])
    ['alta', 'angie', 'arlie', 'b', 'b-baseline']
    >>> print(dataset_ids[-5:])
    ['vonuslar', 'wedding', 'west_coast_teens', 'zurich-f', 'zurich-m']
    """
    # A dataset ID is the part of a registry filename before its first dot;
    # the set collapses filenames that share the same ID.
    unique_ids = {
        filename.partition(".")[0] for filename in repository.registry_files
    }
    return sorted(unique_ids)
| 47 | + |
| 48 | + |
def fetch(fname):
    """Fetch one `dreambank` file and return the path to the local copy.

    Useful when the file should be read with custom code instead of the
    readers provided by this module.

    Parameters
    ----------
    fname : str
        Dataset ID and extension (e.g., ``'alta.tsv'``, ``'alta.json'``).

    Returns
    -------
    fp : str
        Full filepath of local file.

    Examples
    --------
    >>> import dreambank
    >>> import pandas as pd
    >>>
    >>> fp = dreambank.fetch("bosnak.tsv")
    >>> bosnak = pd.read_table(fp, index_col="n")
    """
    # Delegates entirely to the module-level pooch repository.
    return repository.fetch(fname)
|
55 | 74 |
|
56 |
| -def read_dreams(dataset_id, version=1): |
57 |
| - fp = create_pup(version).fetch(f"{dataset_id}.tsv") |
58 |
| - return pd.read_table(fp) |
def read_dreams(dataset_id):
    """Load the dreams of one dataset into a :class:`pandas.DataFrame`.

    Parameters
    ----------
    dataset_id : str
        The dataset to read in.

    Returns
    -------
    dreams : :class:`pandas.DataFrame`
        A :class:`~pandas.DataFrame` with 2 or 3 columns.

    Examples
    --------
    >>> import dreambank
    >>> dreams = dreambank.read_dreams("izzy22_25")
    >>> dreams.head(3)
    """
    tsv_path = fetch(f"{dataset_id}.tsv")
    # Every column is read with the pandas "string" dtype.
    return pd.read_table(tsv_path, dtype="string")
| 97 | + |
def read_info(dataset_id):
    """Load the info (i.e., metadata) of one dataset as a dictionary.

    Parameters
    ----------
    dataset_id : str
        The dataset to read in.

    Returns
    -------
    info : dict
        A dictionary with metadata for the given dataset.

        * ``short_name``: dataset_id
        * ``long_name``: long_name
        * ``n_dreams``: n_dreams
        * ``timeframe``: timeframe
        * ``sex``: sex
        * ``description``: description

    Examples
    --------
    >>> import dreambank
    >>> info = dreambank.read_info("izzy22_25")
    >>> info
    """
    json_path = fetch(f"{dataset_id}.json")
    # The metadata file is decoded as UTF-8 and parsed as JSON.
    with open(json_path, "rt", encoding="utf-8") as stream:
        return json.load(stream)
0 commit comments