diff --git a/.all-contributorsrc b/.all-contributorsrc
index 99b1002ec..2769d40c6 100644
--- a/.all-contributorsrc
+++ b/.all-contributorsrc
@@ -608,6 +608,34 @@
"bug",
"maintenance"
]
+ },
+ {
+ "login": "jkuhl-uni",
+ "name": "Justus Kuhlmann",
+ "avatar_url": "https://avatars.githubusercontent.com/u/82444481?v=4",
+ "profile": "https://github.com/jkuhl-uni",
+ "contributions": [
+ "content"
+ ]
+ },
+ {
+ "login": "melanieganz",
+ "name": "melanieganz",
+ "avatar_url": "https://avatars.githubusercontent.com/u/25242978?v=4",
+ "profile": "https://sites.google.com/view/melanieganz/home",
+ "contributions": [
+ "bug"
+ ]
+ },
+ {
+ "login": "damienfrancois",
+ "name": "Damien François",
+ "avatar_url": "https://avatars.githubusercontent.com/u/1721582?v=4",
+ "profile": "http://www.uclouvain.be/damien.francois",
+ "contributions": [
+ "bug",
+ "content"
+ ]
}
],
"contributorsPerLine": 7,
diff --git a/.zenodo.json b/.zenodo.json
index 20de1a45a..ce5395be6 100644
--- a/.zenodo.json
+++ b/.zenodo.json
@@ -239,6 +239,16 @@
"affiliation": "National Eye Institute, National Institutes of Health, USA",
"orcid": "0000-0002-4579-003X"
},
+ {
+ "name": "Kuhlmann, Justus Theodor",
+ "affiliation": "Universität Münster",
+ "orcid": "0000-0001-5291-1939"
+ },
+ {
+ "name": "François, Damian",
+ "affiliation": "Université catholique de Louvain",
+ "orcid": "0000-0001-5131-9431"
+ },
{
"name": "Hanke, Michael",
"affiliation": "Institute of Neuroscience and Medicine, Brain & Behaviour (INM-7), Research Centre Jülich, Jülich, Germany and Institute of Systems Neuroscience, Medical Faculty, Heinrich Heine University Düsseldorf, Düsseldorf, Germany",
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 21ffa08a9..3600bfb8b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -25,7 +25,7 @@ All over the handbook, version notes or information relating to datalad versions
#### Advanced
#### Usecases
-- A new usecase about encrypted workflows is now part of the handbook ([#895][])
+- A new use case about encrypted workflows is now part of the handbook ([#895][])
#### Miscellaneous additions
- The Makefile in the source repository received a more intuitive and fine-grained structure ([#901][])
@@ -100,7 +100,7 @@ It includes contributions by the new contributors @eort, @mslw, @tguiot, @jhpb7 and
- The GitHub project of the handbook now uses templates for easier issue generation. ([#768][])
- A number of CSS improvements fix the rendering of bullet points ([#770][])
-- The ML usecase was minified to speed up builds ([#790][])
+- The ML use case was minified to speed up builds ([#790][])
- A new code list for the DGPA workshop was added ([#820][])
## v0.15 (November 25 2021) -- LaTeX improvements
diff --git a/README.md b/README.md
index 7d6ae2108..b9a95060a 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,10 @@
[![Build status](https://ci.appveyor.com/api/projects/status/v8o167109n3irf5c/branch/main?svg=true)](https://ci.appveyor.com/project/mih/book/branch/main)
[![Monthly link check](https://github.com/datalad-handbook/book/actions/workflows/linkcheck.yml/badge.svg)](https://github.com/datalad-handbook/book/actions/workflows/linkcheck.yml)
-[![Documentation Status](https://readthedocs.org/projects/datalad-handbook/badge/?version=latest)](http://handbook.datalad.org/en/latest/?badge=latest)
+[![Documentation Status](https://readthedocs.org/projects/datalad-handbook/badge/?version=latest)](https://handbook.datalad.org/en/latest/?badge=latest)
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3608611.svg)](https://doi.org/10.5281/zenodo.3608611)
[![made-with-datalad](https://www.datalad.org/badges/made_with.svg)](https://datalad.org)
-[![All Contributors](https://img.shields.io/badge/all_contributors-57-orange.svg?style=flat-square)](#contributors-)
+[![All Contributors](https://img.shields.io/badge/all_contributors-59-orange.svg?style=flat-square)](#contributors-)
# The DataLad handbook :orange_book:
@@ -69,7 +69,7 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d
Kyle Meyer 🐛 👀 💬 🖋 🤔 |
Marisa Heckner 🤔 📓 🐛 🖋 |
Benjamin Poldrack 💬 🤔 💡 ✅ |
- Yaroslav Halchenko 👀 🖋 🤔 🐛 |
+ Yaroslav Halchenko 👀 🖋 🤔 🐛 |
Chris Markiewicz 🐛 |
@@ -136,6 +136,9 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d
Danny Garside 🐛 🚧 |
+ Justus Kuhlmann 🖋 |
+ melanieganz 🐛 |
+ Damien François 🐛 🖋 |
diff --git a/artwork/OHBM_2020.svg b/artwork/OHBM_2020.svg
index d60baa0aa..df2a40b61 100644
--- a/artwork/OHBM_2020.svg
+++ b/artwork/OHBM_2020.svg
@@ -3356,8 +3356,8 @@
id="g6756"
transform="translate(0,-863.42482)">
[remainder of the SVG hunk, together with the diff header and opening hunks of the versioneer-generated _version.py file, lost in extraction]
-def get_keywords():
+def get_keywords() -> Dict[str, str]:
"""Get the keywords needed to look up the version information."""
# these strings will be replaced by git during git-archive.
# setup.py/versioneer.py will grep for the variable names, so they must
@@ -33,8 +36,15 @@ def get_keywords():
class VersioneerConfig:
"""Container for Versioneer configuration parameters."""
+ VCS: str
+ style: str
+ tag_prefix: str
+ parentdir_prefix: str
+ versionfile_source: str
+ verbose: bool
-def get_config():
+
+def get_config() -> VersioneerConfig:
"""Create, populate and return the VersioneerConfig() object."""
# these strings are filled in when 'setup.py versioneer' creates
# _version.py
@@ -52,13 +62,13 @@ class NotThisMethod(Exception):
"""Exception raised if a method is not valid for the current scenario."""
-LONG_VERSION_PY = {}
-HANDLERS = {}
+LONG_VERSION_PY: Dict[str, str] = {}
+HANDLERS: Dict[str, Dict[str, Callable]] = {}
-def register_vcs_handler(vcs, method): # decorator
- """Decorator to mark a method as the handler for a particular VCS."""
- def decorate(f):
+def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator
+ """Create decorator to mark a method as the handler of a VCS."""
+ def decorate(f: Callable) -> Callable:
"""Store f in HANDLERS[vcs][method]."""
if vcs not in HANDLERS:
HANDLERS[vcs] = {}
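As an aside for reviewers, the registry pattern this hunk touches is easy to exercise in isolation. A minimal, self-contained sketch (the decorated handler body is a stand-in for demonstration, not versioneer's real one):

    from typing import Callable, Dict

    HANDLERS: Dict[str, Dict[str, Callable]] = {}

    def register_vcs_handler(vcs: str, method: str) -> Callable:
        """Create decorator to mark a method as the handler of a VCS."""
        def decorate(f: Callable) -> Callable:
            # Store f under HANDLERS[vcs][method], creating the per-VCS
            # mapping on first use.
            if vcs not in HANDLERS:
                HANDLERS[vcs] = {}
            HANDLERS[vcs][method] = f
            return f
        return decorate

    @register_vcs_handler("git", "get_keywords")
    def fake_get_keywords(versionfile_abs: str) -> Dict[str, str]:
        # Stand-in handler, used only for this demonstration.
        return {}

    assert HANDLERS["git"]["get_keywords"] is fake_get_keywords
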
@@ -67,22 +77,35 @@ def decorate(f):
return decorate
-def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
- env=None):
+def run_command(
+ commands: List[str],
+ args: List[str],
+ cwd: Optional[str] = None,
+ verbose: bool = False,
+ hide_stderr: bool = False,
+ env: Optional[Dict[str, str]] = None,
+) -> Tuple[Optional[str], Optional[int]]:
"""Call the given command(s)."""
assert isinstance(commands, list)
- p = None
- for c in commands:
+ process = None
+
+ popen_kwargs: Dict[str, Any] = {}
+ if sys.platform == "win32":
+ # This hides the console window if pythonw.exe is used
+ startupinfo = subprocess.STARTUPINFO()
+ startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
+ popen_kwargs["startupinfo"] = startupinfo
+
+ for command in commands:
try:
- dispcmd = str([c] + args)
+ dispcmd = str([command] + args)
# remember shell=False, so use git.cmd on windows, not just git
- p = subprocess.Popen([c] + args, cwd=cwd, env=env,
- stdout=subprocess.PIPE,
- stderr=(subprocess.PIPE if hide_stderr
- else None))
+ process = subprocess.Popen([command] + args, cwd=cwd, env=env,
+ stdout=subprocess.PIPE,
+ stderr=(subprocess.PIPE if hide_stderr
+ else None), **popen_kwargs)
break
- except EnvironmentError:
- e = sys.exc_info()[1]
+ except OSError as e:
if e.errno == errno.ENOENT:
continue
if verbose:
@@ -93,18 +116,20 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
if verbose:
print("unable to find command, tried %s" % (commands,))
return None, None
- stdout = p.communicate()[0].strip()
- if sys.version_info[0] >= 3:
- stdout = stdout.decode()
- if p.returncode != 0:
+ stdout = process.communicate()[0].strip().decode()
+ if process.returncode != 0:
if verbose:
print("unable to run %s (error)" % dispcmd)
print("stdout was %s" % stdout)
- return None, p.returncode
- return stdout, p.returncode
+ return None, process.returncode
+ return stdout, process.returncode
-def versions_from_parentdir(parentdir_prefix, root, verbose):
+def versions_from_parentdir(
+ parentdir_prefix: str,
+ root: str,
+ verbose: bool,
+) -> Dict[str, Any]:
"""Try to determine the version from the parent directory name.
Source tarballs conventionally unpack into a directory that includes both
@@ -113,15 +138,14 @@ def versions_from_parentdir(parentdir_prefix, root, verbose):
"""
rootdirs = []
- for i in range(3):
+ for _ in range(3):
dirname = os.path.basename(root)
if dirname.startswith(parentdir_prefix):
return {"version": dirname[len(parentdir_prefix):],
"full-revisionid": None,
"dirty": False, "error": None, "date": None}
- else:
- rootdirs.append(root)
- root = os.path.dirname(root) # up a level
+ rootdirs.append(root)
+ root = os.path.dirname(root) # up a level
if verbose:
print("Tried directories %s but none started with prefix %s" %
@@ -130,41 +154,48 @@ def versions_from_parentdir(parentdir_prefix, root, verbose):
@register_vcs_handler("git", "get_keywords")
-def git_get_keywords(versionfile_abs):
+def git_get_keywords(versionfile_abs: str) -> Dict[str, str]:
"""Extract version information from the given file."""
# the code embedded in _version.py can just fetch the value of these
# keywords. When used from setup.py, we don't want to import _version.py,
# so we do it with a regexp instead. This function is not used from
# _version.py.
- keywords = {}
+ keywords: Dict[str, str] = {}
try:
- f = open(versionfile_abs, "r")
- for line in f.readlines():
- if line.strip().startswith("git_refnames ="):
- mo = re.search(r'=\s*"(.*)"', line)
- if mo:
- keywords["refnames"] = mo.group(1)
- if line.strip().startswith("git_full ="):
- mo = re.search(r'=\s*"(.*)"', line)
- if mo:
- keywords["full"] = mo.group(1)
- if line.strip().startswith("git_date ="):
- mo = re.search(r'=\s*"(.*)"', line)
- if mo:
- keywords["date"] = mo.group(1)
- f.close()
- except EnvironmentError:
+ with open(versionfile_abs, "r") as fobj:
+ for line in fobj:
+ if line.strip().startswith("git_refnames ="):
+ mo = re.search(r'=\s*"(.*)"', line)
+ if mo:
+ keywords["refnames"] = mo.group(1)
+ if line.strip().startswith("git_full ="):
+ mo = re.search(r'=\s*"(.*)"', line)
+ if mo:
+ keywords["full"] = mo.group(1)
+ if line.strip().startswith("git_date ="):
+ mo = re.search(r'=\s*"(.*)"', line)
+ if mo:
+ keywords["date"] = mo.group(1)
+ except OSError:
pass
return keywords
@register_vcs_handler("git", "keywords")
-def git_versions_from_keywords(keywords, tag_prefix, verbose):
+def git_versions_from_keywords(
+ keywords: Dict[str, str],
+ tag_prefix: str,
+ verbose: bool,
+) -> Dict[str, Any]:
"""Get version information from git keywords."""
- if not keywords:
- raise NotThisMethod("no keywords at all, weird")
+ if "refnames" not in keywords:
+ raise NotThisMethod("Short version file found")
date = keywords.get("date")
if date is not None:
+ # Use only the last line. Previous lines may contain GPG signature
+ # information.
+ date = date.splitlines()[-1]
+
# git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant
# datestamp. However we prefer "%ci" (which expands to an "ISO-8601
# -like" string, which we must then edit to make compliant), because
@@ -177,11 +208,11 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
if verbose:
print("keywords are unexpanded, not using")
raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
- refs = set([r.strip() for r in refnames.strip("()").split(",")])
+ refs = {r.strip() for r in refnames.strip("()").split(",")}
# starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
# just "foo-1.0". If we see a "tag: " prefix, prefer those.
TAG = "tag: "
- tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)])
+ tags = {r[len(TAG):] for r in refs if r.startswith(TAG)}
if not tags:
# Either we're using git < 1.8.3, or there really are no tags. We use
# a heuristic: assume all version tags have a digit. The old git %d
@@ -190,7 +221,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
# between branches and tags. By ignoring refnames without digits, we
# filter out many common branch names like "release" and
# "stabilization", as well as "HEAD" and "master".
- tags = set([r for r in refs if re.search(r'\d', r)])
+ tags = {r for r in refs if re.search(r'\d', r)}
if verbose:
print("discarding '%s', no digits" % ",".join(refs - tags))
if verbose:
@@ -199,6 +230,11 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
# sorting will prefer e.g. "2.0" over "2.0rc1"
if ref.startswith(tag_prefix):
r = ref[len(tag_prefix):]
+ # Filter out refs that exactly match prefix or that don't start
+ # with a number once the prefix is stripped (mostly a concern
+ # when prefix is '')
+ if not re.match(r'\d', r):
+ continue
if verbose:
print("picking %s" % r)
return {"version": r,
@@ -214,7 +250,12 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
@register_vcs_handler("git", "pieces_from_vcs")
-def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
+def git_pieces_from_vcs(
+ tag_prefix: str,
+ root: str,
+ verbose: bool,
+ runner: Callable = run_command
+) -> Dict[str, Any]:
"""Get version from 'git describe' in the root of the source tree.
This only gets called if the git-archive 'subst' keywords were *not*
@@ -225,8 +266,15 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
if sys.platform == "win32":
GITS = ["git.cmd", "git.exe"]
- out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root,
- hide_stderr=True)
+ # GIT_DIR can interfere with correct operation of Versioneer.
+ # It may be intended to be passed to the Versioneer-versioned project,
+ # but that should not change where we get our version from.
+ env = os.environ.copy()
+ env.pop("GIT_DIR", None)
+ runner = functools.partial(runner, env=env)
+
+ _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root,
+ hide_stderr=not verbose)
if rc != 0:
if verbose:
print("Directory %s not under git control" % root)
@@ -234,24 +282,57 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
# if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
# if there isn't one, this yields HEX[-dirty] (no NUM)
- describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty",
- "--always", "--long",
- "--match", "%s*" % tag_prefix],
- cwd=root)
+ describe_out, rc = runner(GITS, [
+ "describe", "--tags", "--dirty", "--always", "--long",
+ "--match", f"{tag_prefix}[[:digit:]]*"
+ ], cwd=root)
# --long was added in git-1.5.5
if describe_out is None:
raise NotThisMethod("'git describe' failed")
describe_out = describe_out.strip()
- full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root)
+ full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root)
if full_out is None:
raise NotThisMethod("'git rev-parse' failed")
full_out = full_out.strip()
- pieces = {}
+ pieces: Dict[str, Any] = {}
pieces["long"] = full_out
pieces["short"] = full_out[:7] # maybe improved later
pieces["error"] = None
+ branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"],
+ cwd=root)
+ # --abbrev-ref was added in git-1.6.3
+ if rc != 0 or branch_name is None:
+ raise NotThisMethod("'git rev-parse --abbrev-ref' returned error")
+ branch_name = branch_name.strip()
+
+ if branch_name == "HEAD":
+ # If we aren't exactly on a branch, pick a branch which represents
+ # the current commit. If all else fails, we are on a branchless
+ # commit.
+ branches, rc = runner(GITS, ["branch", "--contains"], cwd=root)
+ # --contains was added in git-1.5.4
+ if rc != 0 or branches is None:
+ raise NotThisMethod("'git branch --contains' returned error")
+ branches = branches.split("\n")
+
+ # Remove the first line if we're running detached
+ if "(" in branches[0]:
+ branches.pop(0)
+
+ # Strip off the leading "* " from the list of branches.
+ branches = [branch[2:] for branch in branches]
+ if "master" in branches:
+ branch_name = "master"
+ elif not branches:
+ branch_name = None
+ else:
+ # Pick the first branch that is returned. Good or bad.
+ branch_name = branches[0]
+
+ pieces["branch"] = branch_name
+
# parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
# TAG might have hyphens.
git_describe = describe_out
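The new branch-picking logic above can be traced with canned input. A sketch, assuming a made-up ``git branch --contains`` output for a detached HEAD:

    # Made-up "git branch --contains" output while detached at a commit
    # that is reachable from both master and a feature branch.
    branches = "* (HEAD detached at 1a2b3c4)\n  master\n  feature-x".split("\n")

    # Remove the first line if we're running detached.
    if "(" in branches[0]:
        branches.pop(0)

    # Strip off the leading "* " (or "  ") from each branch name.
    branches = [branch[2:] for branch in branches]

    # Prefer master, fall back to the first listed branch, else branchless.
    if "master" in branches:
        branch_name = "master"
    elif not branches:
        branch_name = None
    else:
        branch_name = branches[0]

    assert branch_name == "master"
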
@@ -293,26 +374,27 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
else:
# HEX: no tags
pieces["closest-tag"] = None
- count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"],
- cwd=root)
- pieces["distance"] = int(count_out) # total number of commits
+ out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root)
+ pieces["distance"] = len(out.split()) # total number of commits
# commit date: see ISO-8601 comment in git_versions_from_keywords()
- date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"],
- cwd=root)[0].strip()
+ date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip()
+ # Use only the last line. Previous lines may contain GPG signature
+ # information.
+ date = date.splitlines()[-1]
pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
return pieces
-def plus_or_dot(pieces):
+def plus_or_dot(pieces: Dict[str, Any]) -> str:
"""Return a + if we don't already have one, else return a ."""
if "+" in pieces.get("closest-tag", ""):
return "."
return "+"
-def render_pep440(pieces):
+def render_pep440(pieces: Dict[str, Any]) -> str:
"""Build up version string, with post-release "local version identifier".
Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
@@ -337,23 +419,71 @@ def render_pep440(pieces):
return rendered
-def render_pep440_pre(pieces):
- """TAG[.post.devDISTANCE] -- No -dirty.
+def render_pep440_branch(pieces: Dict[str, Any]) -> str:
+ """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] .
+
+ The ".dev0" means not master branch. Note that .dev0 sorts backwards
+ (a feature branch will appear "older" than the master branch).
Exceptions:
- 1: no tags. 0.post.devDISTANCE
+ 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty]
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
+ if pieces["distance"] or pieces["dirty"]:
+ if pieces["branch"] != "master":
+ rendered += ".dev0"
+ rendered += plus_or_dot(pieces)
+ rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
+ if pieces["dirty"]:
+ rendered += ".dirty"
+ else:
+ # exception #1
+ rendered = "0"
+ if pieces["branch"] != "master":
+ rendered += ".dev0"
+ rendered += "+untagged.%d.g%s" % (pieces["distance"],
+ pieces["short"])
+ if pieces["dirty"]:
+ rendered += ".dirty"
+ return rendered
+
+
+def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]:
+ """Split pep440 version string at the post-release segment.
+
+ Returns the release segments before the post-release and the
+ post-release version number (or -1 if no post-release segment is present).
+ """
+ vc = str.split(ver, ".post")
+ return vc[0], int(vc[1] or 0) if len(vc) == 2 else None
+
+
+def render_pep440_pre(pieces: Dict[str, Any]) -> str:
+ """TAG[.postN.devDISTANCE] -- No -dirty.
+
+ Exceptions:
+ 1: no tags. 0.post0.devDISTANCE
+ """
+ if pieces["closest-tag"]:
if pieces["distance"]:
- rendered += ".post.dev%d" % pieces["distance"]
+ # update the post release segment
+ tag_version, post_version = pep440_split_post(pieces["closest-tag"])
+ rendered = tag_version
+ if post_version is not None:
+ rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"])
+ else:
+ rendered += ".post0.dev%d" % (pieces["distance"])
+ else:
+ # no commits, use the tag as the version
+ rendered = pieces["closest-tag"]
else:
# exception #1
- rendered = "0.post.dev%d" % pieces["distance"]
+ rendered = "0.post0.dev%d" % pieces["distance"]
return rendered
-def render_pep440_post(pieces):
+def render_pep440_post(pieces: Dict[str, Any]) -> str:
"""TAG[.postDISTANCE[.dev0]+gHEX] .
The ".dev0" means dirty. Note that .dev0 sorts backwards
@@ -380,12 +510,41 @@ def render_pep440_post(pieces):
return rendered
-def render_pep440_old(pieces):
+def render_pep440_post_branch(pieces: Dict[str, Any]) -> str:
+ """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] .
+
+ The ".dev0" means not master branch.
+
+ Exceptions:
+ 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty]
+ """
+ if pieces["closest-tag"]:
+ rendered = pieces["closest-tag"]
+ if pieces["distance"] or pieces["dirty"]:
+ rendered += ".post%d" % pieces["distance"]
+ if pieces["branch"] != "master":
+ rendered += ".dev0"
+ rendered += plus_or_dot(pieces)
+ rendered += "g%s" % pieces["short"]
+ if pieces["dirty"]:
+ rendered += ".dirty"
+ else:
+ # exception #1
+ rendered = "0.post%d" % pieces["distance"]
+ if pieces["branch"] != "master":
+ rendered += ".dev0"
+ rendered += "+g%s" % pieces["short"]
+ if pieces["dirty"]:
+ rendered += ".dirty"
+ return rendered
+
+
+def render_pep440_old(pieces: Dict[str, Any]) -> str:
"""TAG[.postDISTANCE[.dev0]] .
The ".dev0" means dirty.
- Eexceptions:
+ Exceptions:
1: no tags. 0.postDISTANCE[.dev0]
"""
if pieces["closest-tag"]:
@@ -402,7 +561,7 @@ def render_pep440_old(pieces):
return rendered
-def render_git_describe(pieces):
+def render_git_describe(pieces: Dict[str, Any]) -> str:
"""TAG[-DISTANCE-gHEX][-dirty].
Like 'git describe --tags --dirty --always'.
@@ -422,7 +581,7 @@ def render_git_describe(pieces):
return rendered
-def render_git_describe_long(pieces):
+def render_git_describe_long(pieces: Dict[str, Any]) -> str:
"""TAG-DISTANCE-gHEX[-dirty].
Like 'git describe --tags --dirty --always --long'.
@@ -442,7 +601,7 @@ def render_git_describe_long(pieces):
return rendered
-def render(pieces, style):
+def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]:
"""Render the given version pieces into the requested style."""
if pieces["error"]:
return {"version": "unknown",
@@ -456,10 +615,14 @@ def render(pieces, style):
if style == "pep440":
rendered = render_pep440(pieces)
+ elif style == "pep440-branch":
+ rendered = render_pep440_branch(pieces)
elif style == "pep440-pre":
rendered = render_pep440_pre(pieces)
elif style == "pep440-post":
rendered = render_pep440_post(pieces)
+ elif style == "pep440-post-branch":
+ rendered = render_pep440_post_branch(pieces)
elif style == "pep440-old":
rendered = render_pep440_old(pieces)
elif style == "git-describe":
@@ -474,7 +637,7 @@ def render(pieces, style):
"date": pieces.get("date")}
-def get_versions():
+def get_versions() -> Dict[str, Any]:
"""Get version information or return default if unable to do so."""
# I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
# __file__, we can work backwards from there to the root. Some
@@ -495,7 +658,7 @@ def get_versions():
# versionfile_source is the relative path from the top of the source
# tree (where the .git directory might live) to this file. Invert
# this to find the root from __file__.
- for i in cfg.versionfile_source.split('/'):
+ for _ in cfg.versionfile_source.split('/'):
root = os.path.dirname(root)
except NameError:
return {"version": "0+unknown", "full-revisionid": None,
diff --git a/docs/OHBMposter.rst b/docs/OHBMposter.rst
index 406995c05..70a2a29d0 100644
--- a/docs/OHBMposter.rst
+++ b/docs/OHBMposter.rst
@@ -1,10 +1,9 @@
+.. index:: ! OHBM 2020 Poster
.. _ohbm2020poster:
Handbook Poster from the 2020 (virtual) OHBM
--------------------------------------------
-.. index:: ! OHBM 2020 Poster
-
Here you can find the poster about the DataLad Handbook, presented at the `2020 virtual OHBM `_ under poster number 1914.
.. only:: html
diff --git a/docs/_static/custom.css b/docs/_static/custom.css
index f23e2d75d..a2a002d3f 100644
--- a/docs/_static/custom.css
+++ b/docs/_static/custom.css
@@ -170,3 +170,15 @@ progress::-webkit-progress-bar {
content: "↷ "
}
+/* Get a compact URL index list */
+div.book-v1-url-index.docutils.container p {
+ display: inline;
+}
+div.book-v1-url-index.docutils.container dl.field-list {
+ display: grid;
+ grid-template-columns: max-content auto;
+ align-items: center;
+}
+div.book-v1-url-index.docutils.container dl.field-list dt {
+ word-break: unset !important;
+}
diff --git a/docs/_static/distribits-teaser.jpg b/docs/_static/distribits-teaser.jpg
deleted file mode 100644
index 87f1a0241..000000000
Binary files a/docs/_static/distribits-teaser.jpg and /dev/null differ
diff --git a/docs/_templates/sidebarintro.html b/docs/_templates/sidebarintro.html
index 51c70457c..65db31c1f 100644
--- a/docs/_templates/sidebarintro.html
+++ b/docs/_templates/sidebarintro.html
@@ -47,6 +47,7 @@ Contributors
Useful Links
+ - Get a real book!
- DataLad Website
- Developer Docs
- DataLad@GitHub
diff --git a/docs/acknowledgements.rst b/docs/acknowledgements.rst
index e594980a5..f24a7f3c0 100644
--- a/docs/acknowledgements.rst
+++ b/docs/acknowledgements.rst
@@ -1,5 +1,11 @@
Acknowledgements
----------------
+
+The DataLad software and its documentation are the joint work of more than 100 individuals.
+We are deeply grateful for these contributions to free and open source software (FOSS) and documentation.
+Likewise, we are grateful to the many more people who produce and maintain the FOSS ecosystem that DataLad is built on.
+We are particularly indebted to Joey Hess, the author of the git-annex software, without which DataLad would not be what it is today.
+
The DataLad project received support through the following grants:
* US-German collaboration in computational neuroscience (CRCNS) project "DataGit: converging catalogues, warehouses, and deployment logistics into a federated 'data distribution'" (Halchenko/Hanke), co-funded by the US National Science Foundation (NSF 1429999) and the German Federal Ministry of Education and Research (BMBF 01GQ1411).
@@ -8,9 +14,10 @@ The DataLad project received support through the following grants:
* German federal state of Saxony-Anhalt and the European Regional Development Fund (ERDF), Project: Center for Behavioral Brain Sciences, Imaging Platform
* ReproNim project (NIH 1P41EB019936-01A1).
* Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) under grant SFB 1451 (431549029, INF project)
-* European Union’s Horizon 2020 research and innovation programme under grant agreements:
- * Human Brain Project SGA3 (H2020-EU.3.1.5.3, grant no. 945539)
- * VirtualBrainCloud (H2020-EU.3.1.5.3, grant no. 826421)
+* European Union’s Horizon 2020 research and innovation programme under grant agreements
+
+ * Human Brain Project SGA3 (H2020-EU.3.1.5.3, grant no. 945539)
+ * VirtualBrainCloud (H2020-EU.3.1.5.3, grant no. 826421)
.. figure:: artwork/src/funding.svg
diff --git a/docs/artwork b/docs/artwork
index 803cc9cb9..7b9325c57 160000
--- a/docs/artwork
+++ b/docs/artwork
@@ -1 +1 @@
-Subproject commit 803cc9cb986691d5cb5472033a77f818c73e2fb4
+Subproject commit 7b9325c5720968db931e930aed7fbe54cd9ad879
diff --git a/docs/basics/101-101-create.rst b/docs/basics/101-101-create.rst
index 7e8b9bdfc..2d7a7baff 100644
--- a/docs/basics/101-101-create.rst
+++ b/docs/basics/101-101-create.rst
@@ -1,3 +1,6 @@
+.. index::
+ pair: create; DataLad command
+ pair: create dataset; with DataLad
.. _createDS:
Create a dataset
@@ -19,33 +22,35 @@ useful features.
Because experiencing is more insightful than just reading, we will explore the
concepts of DataLad datasets together by creating one.
-.. index:: ! datalad command; create
-
Find a nice place on your computer's file system to put a dataset for ``DataLad-101``,
and create a fresh, empty dataset with the :dlcmd:`create` command.
Note the command structure of :dlcmd:`create` (optional bits are enclosed in ``[ ]``):
-.. code-block:: bash
+.. code-block::
datalad create [--description "..."] [-c ] PATH
-.. find-out-more:: What is the description option of datalad-create?
+.. _createdescription:
+.. index::
+ pair: set description for dataset location; with DataLad
+.. find-out-more:: What is the description option of 'datalad create'?
The optional ``--description`` flag allows you to provide a short description of
the *location* of your dataset, for example with
- .. code-block:: bash
+ .. code-block:: console
- datalad create --description "course on DataLad-101 on my private Laptop" -c text2git DataLad-101
+ $ datalad create --description "course on DataLad-101 on my private laptop" -c text2git DataLad-101
- If you want, use the above command instead of the :dlcmd:`create` command below
- to provide a description. Its use will not be immediately clear, the chapter
- :ref:`chapter_collaboration`) will show you where this description
+ If you want, use the above command instead to provide a description. Its use will not be immediately clear now, but the chapter
+ :ref:`chapter_collaboration` shows where this description
ends up and how it may be useful.
Let's start:
+.. index::
+ pair: create dataset; with DataLad
.. runrecord:: _examples/DL-101-101-101
:language: console
:workdir: dl-101
@@ -78,7 +83,7 @@ can be tracked (should you want them to be tracked).
*Tracking* in this context means that edits done to a file are automatically
associated with information about the change, the author of the edit,
and the time of this change. This is already informative on its own
--- the :term:`provenance` captured with this can for example be used to learn
+-- the :term:`provenance` captured with this can, for example, be used to learn
about a file's lineage, and can establish trust in it.
But what is especially helpful is that previous states of files or directories
can be restored. Remember the last time you accidentally deleted content
@@ -87,12 +92,17 @@ mistakes are forever. We will see many examples of this later in the book,
and such information is stored in what we will refer
to as the *history* of a dataset.
+.. index::
+ pair: log; Git command
+ pair: exit pager; in a terminal
+ pair: show history; with Git
+
This history is almost as small as it can be at the current state, but let's take
a look at it. For looking at the history, the code examples will use :gitcmd:`log`,
a built-in :term:`Git` command [#f1]_ that works right in your terminal. Your log
-*might* be opened in a `terminal pager `_
+*might* be opened in a terminal :term:`pager`
that lets you scroll up and down with your arrow keys, but not enter any more commands.
-If this happens, you can get out of git log by pressing ``q``.
+If this happens, you can get out of ``git log`` by pressing ``q``.
.. runrecord:: _examples/DL-101-101-103
:language: console
@@ -107,21 +117,13 @@ We can see two :term:`commit`\s in the history of the repository.
Each of them is identified by a unique 40 character sequence, called a
:term:`shasum`.
-.. windows-wit:: Your Git log may be more extensive - use "git log main" instead!
-
- The output of :gitcmd:`log` shown in the handbook and the output you will see in your own datasets when executing the same commands may not always match -- many times you might see commits about a "git-annex adjusted branch" in your history.
- This is expected, and if you want to read up more about this, please progress on to chapter :ref:`chapter_gitannex` and afterwards take a look at `this part of git-annex documentation `_.
-
- In order to get a similar experience in your dataset, please add the name of your default :term:`branch` (it will likely have the name ``main`` or ``master``) to every ``git log`` command.
- This should display the same output that the handbook displays.
- The reason behind this is that datasets are using a special :term:`branch` to be functional on Windows.
- This branch's history differs from the history that would be in the default branch.
- With this workaround, you will be able to display the dataset history from the same branch that the handbook and all other operating system display.
- Thus, whenever the handbook code snippet contains a line that starts with ``git log``, copy it and append the term ``main`` or ``master``, whichever is appropriate.
-
- If you are eager to help to improve the handbook, you could do us a favor by reporting any places with mismatches between Git logs on Windows and in the handbook.
- `Get in touch `_!
+.. index::
+ pair: log; Git command
+ pair: corresponding branch; in adjusted mode
+ pair: show history; on Windows
+.. windows-wit:: Your Git log may be more extensive - use 'git log main' instead!
+ .. include:: topic/adjustedmode-log.rst
Highlighted in this output is information about the author and about
the time, as well as a :term:`commit message` that summarizes the
@@ -135,6 +137,8 @@ While these commits were produced and described by DataLad,
in most other cases, you will have to create the commit and
an informative commit message yourself.
+.. index::
+ pair: create dataset; DataLad concept
.. gitusernote:: Create internals
:dlcmd:`create` uses :gitcmd:`init` and :gitannexcmd:`init`. Therefore,
diff --git a/docs/basics/101-102-populate.rst b/docs/basics/101-102-populate.rst
index 5b13c93f9..585dd59c9 100644
--- a/docs/basics/101-102-populate.rst
+++ b/docs/basics/101-102-populate.rst
@@ -16,6 +16,9 @@ Let's first create a directory to save books for additional reading in.
$ mkdir books
+.. index::
+ pair: tree; terminal command
+
Let's take a look at the current directory structure with the tree command [#f1]_:
.. runrecord:: _examples/DL-101-102-102
@@ -36,25 +39,20 @@ are all free, in total about 15 MB), and save them in ``DataLad-101/books``.
You can either visit the links and save them in ``books/``,
or run the following commands [#f2]_ to download the books right from the terminal.
-Note that we line break the command with ``\`` signs. In your own work you can write
+Note that we line break the command with ``\`` line continuation characters. In your own work you can write
commands like this into a single line. If you copy them into your terminal as they
are presented here, make sure to check the :windows-wit:`on peculiarities of its terminals
`.
+.. index::
+ pair: line continuation; on Windows in a terminal
.. windows-wit:: Terminals other than Git Bash can't handle multi-line commands
:name: ww-no-multiline-commands
- In Unix shells, ``\`` can be used to split a command into several lines, for example to aid readability.
- Standard Windows terminals (including the Anaconda prompt) do not support this.
- They instead use the ``^`` character:
-
- .. code-block:: bash
-
- $ wget -q https://sourceforge.net/projects/linuxcommand/files/TLCL/19.01/TLCL-19.01.pdf/download ^
- -O TLCL.pdf
-
- If you are not using the Git Bash, you will either need to copy multi-line commands into a single line, or use ``^`` (make sure that there is **no space** afterwards) instead of ``\``.
+ .. include:: topic/terminal-linecontinuation.rst
+.. index::
+ pair: download file; with wget
.. runrecord:: _examples/DL-101-102-103
:language: console
:workdir: dl-101/DataLad-101
@@ -66,28 +64,20 @@ are presented here, make sure to check the :windows-wit:`on peculiarities of its
-O TLCL.pdf
$ wget -q https://github.com/swaroopch/byte-of-python/releases/download/vadb91fc6fce27c58e3f931f5861806d3ccd1054c/byte-of-python.pdf \
-O byte-of-python.pdf
- # get back into the root of the dataset
+ $ # get back into the root of the dataset
$ cd ../
Some machines will not have :shcmd:`wget` available by default, but any command that can
download a file can work as an alternative. See the :windows-wit:`for the popular alternative
curl `.
+.. index::
+ pair: curl instead of wget; on Windows
+ pair: download file; with curl
.. windows-wit:: You can use curl instead of wget
:name: ww-curl-instead-wget
- Many versions of Windows do not ship with the tool ``wget``.
- You can install it, but it may be easier to use the pre-installed ``curl`` command:
-
- .. code-block:: bash
-
- $ cd books
- $ curl -L https://sourceforge.net/projects/linuxcommand/files/TLCL/19.01/TLCL-19.01.pdf/download \
- -o TLCL.pdf
- $ curl -L https://github.com/swaroopch/byte-of-python/releases/download/vadb91fc6fce27c58e3f931f5861806d3ccd1054c/byte-of-python.pdf \
- -o byte-of-python.pdf
- $ cd ../
-
+ .. include:: topic/curl-instead-wget.rst
Let's see what happened. First of all, in the root of ``DataLad-101``, show the directory
structure with tree:
@@ -100,7 +90,9 @@ structure with tree:
$ tree
-.. index:: ! datalad command; status
+.. index::
+ pair: status; DataLad command
+ pair: check dataset for modification; with DataLad
Now what does DataLad do with this new content? One command you will use very
often is :dlcmd:`status`.
@@ -115,7 +107,9 @@ regular status reports should become a habit in the wake of ``DataLad-101``.
$ datalad status
-.. index:: ! datalad command; save
+.. index::
+ pair: save; DataLad command
+ pair: save dataset modification; with DataLad
Interesting; the ``books/`` directory is "untracked". Remember how content
*can* be tracked *if a user wants to*?
@@ -140,7 +134,9 @@ about commit messages because :dlcmd:`save` ultimately uses the command
If you ever forget to specify a message, or made a typo, not all is lost. A
:find-out-more:`explains how to amend a saved state `.
-.. find-out-more:: "Oh no! I forgot the -m option for datalad-save!"
+.. index::
+ pair: amend commit message; with Git
+.. find-out-more:: "Oh no! I forgot the -m option for 'datalad save'!"
:name: fom-amend-save
:float:
@@ -166,10 +162,14 @@ Large content is tracked in an *annex* that is automatically
created and handled by DataLad. Whether text files or larger files change,
all of these changes can be written to your DataLad dataset's history.
+.. index::
+ pair: log; Git command
+ pair: show last commit; with Git
+
Let's see how the saved content shows up in the history of the dataset with :gitcmd:`log`.
The option ``-n 1`` specifies that we want to take a look at the most recent commit.
In order to get a bit more details, we add the ``-p`` flag. If you end up in a
-pager, navigate with up and down arrow keys and leave the log by typing ``q``:
+:term:`pager`, navigate with up and down arrow keys and leave the log by typing ``q``:
.. runrecord:: _examples/DL-101-102-107
:language: console
@@ -187,6 +187,8 @@ the directory ``books/``, and thanks to that commit message we have a nice
human-readable summary of that action. A :find-out-more:`explains what makes
a good message `.
+.. index::
+ pair: recommendation; commit message
.. find-out-more:: DOs and DON'Ts for commit messages
:name: fom-commit-message-guidance
:float: tbp
@@ -207,6 +209,8 @@ a good message `.
- Do not say nasty things about other people
+.. index::
+ pair: no staging; with DataLad
.. gitusernote:: There is no staging area in DataLad
Just as in Git, new files are not tracked from their creation on, but only when
@@ -222,7 +226,7 @@ in the same commit. And ... what happens if I have files I do not want to track?
:dlcmd:`save -m "some commit message"` would save all of what is currently
untracked or modified in the dataset into the history!"
-Regarding your first remark, you're absolutely right!
+Regarding your first remark, you are absolutely right!
It is good practice to save only those changes
together that belong together. We do not want to squish completely unrelated changes
into the same spot of our history, because it would get very nasty should we want to
@@ -262,10 +266,12 @@ Let's give :dlcmd:`save` precisely this file by specifying its path after the co
$ datalad save -m "add reference book about git" books/progit.pdf
-Regarding your second remark, you're right that a :dlcmd:`save` without a
+Regarding your second remark, you are right that a :dlcmd:`save` without a
path specification would write all of the currently untracked files or modifications
to the history. But check the :find-out-more:`on how to tell it otherwise `.
+.. index::
+ pair: save already tracked files only; with DataLad
.. find-out-more:: How to save already tracked dataset components only?
:name: fom-save-updated-only
:float:
@@ -280,13 +286,15 @@ to the history. But check the :find-out-more:`on how to tell it otherwise `_.
[hunk body lost in extraction, together with the diff header and first hunk header of docs/basics/101-103-modify.rst, to which the following lines belong]
+The code snippet, however, contains this note within the start and end part of a
+`heredoc `_.
You can also copy the full code snippet, starting
from ``cat << EOT > notes.txt``, including the ``EOT`` in the last line, in your
terminal to write this note from the terminal (without any editor) into ``notes.txt``.
-.. find-out-more:: How does a here-document work?
+.. index:: here-document, heredoc
+.. find-out-more:: How does a heredoc (here-document) work?
- The code snippet below makes sure to write lines of text into a
+ The code snippet makes sure to write lines of text into a
file (that so far does not exist) called ``notes.txt``.
To do this, the content of the "document" is wrapped in between
@@ -49,28 +50,22 @@ terminal to write this note from the terminal (without any editor) into ``notes.
It might seem like a slightly convoluted way to create a text file with
a note in it. But it allows you to write notes from the terminal, enabling
this book to create commands you can execute with nothing other than your terminal.
- You are free to copy-paste the snippets with the here-documents,
+ You are free to copy-paste the snippets with the heredocs,
or find a workflow that suits you better. The only important thing is that
you create and modify a ``.txt`` file over the course of the Basics part of this
handbook.
-Running the command below will create ``notes.txt`` in the
+Running this command will create ``notes.txt`` in the
root of your ``DataLad-101`` dataset:
+.. index:: heredoc
+ pair: heredoc; on Windows in a terminal
.. windows-wit:: Heredocs don't work under non-Git-Bash Windows terminals
- Heredocs rely on Unix-type redirection and multi-line commands -- which is not supported on most native Windows terminals or the Anaconda prompt on Windows.
- If you are using an Anaconda prompt or a Windows terminal other than Git Bash, instead of executing heredocs, please open up an editor and paste and save the text into it.
-
- The relevant text in the snippet below would be:
-
- .. code-block:: text
-
- One can create a new dataset with 'datalad create [--description] PATH'.
- The dataset is created empty
-
- If you are using Git Bash, however, here docs will work just fine.
+ .. include:: topic/heredoc-windows.rst
+.. index::
+ pair: create heredoc; in a terminal
.. runrecord:: _examples/DL-101-103-101
:language: console
:workdir: dl-101/DataLad-101
@@ -83,6 +78,9 @@ root of your ``DataLad-101`` dataset:
EOT
+.. index::
+ pair: check dataset for modification; with DataLad
+
Run :dlcmd:`status` to confirm that there is a new, untracked file:
.. runrecord:: _examples/DL-101-103-102
@@ -93,6 +91,9 @@ Run :dlcmd:`status` to confirm that there is a new, untracked file:
$ datalad status
+.. index::
+ pair: save dataset modification; with DataLad
+
Save the current state of this file in your dataset's history. Because it is the only modification
in the dataset, there is no need to specify a path.
@@ -107,7 +108,7 @@ But now, let's see how *changing* tracked content works.
Modify this file by adding another note. After all, you already know how to use
:dlcmd:`save`, so write a short summary on that as well.
-Again, the example below uses Unix commands (``cat`` and redirection, this time however
+Again, the example uses Unix commands (``cat`` and redirection, this time however
with ``>>`` to *append* new content to the existing file)
to accomplish this, but you can take any editor of your choice.
@@ -144,7 +145,7 @@ and save the file in DataLad:
$ datalad save -m "add note on datalad save"
Let's take another look into our history to see the development of this file.
-We're using :gitcmd:`log -p -n 2` to see last two commits and explore
+We are using :gitcmd:`log -p -n 2` to see the last two commits and explore
the difference to the previous state of a file within each commit.
.. runrecord:: _examples/DL-101-103-107
@@ -163,7 +164,11 @@ Additions are marked with a ``+``, and deletions would be shown with a leading `
From the dataset's history, we can therefore also find out *how* the text file
evolved over time. That's quite neat, isn't it?
-.. find-out-more:: git log has many more useful options
+.. index::
+ pair: log; Git command
+ pair: get help; with Git
+ pair: filter history; with Git
+.. find-out-more:: 'git log' has many more useful options
``git log``, as many other ``Git`` commands, has a good number of options
which you can discover if you run ``git log --help``. Those options could
diff --git a/docs/basics/101-105-install.rst b/docs/basics/101-105-install.rst
index bb4837685..ad35afa9a 100644
--- a/docs/basics/101-105-install.rst
+++ b/docs/basics/101-105-install.rst
@@ -1,4 +1,6 @@
-.. index:: ! datalad command; clone
+.. index::
+ pair: clone; DataLad command
+ pair: clone dataset; with DataLad
.. _installds:
Install datasets
@@ -24,7 +26,7 @@ can share our mid-term and final projects easily!"
"But today, let's only focus on how to install a dataset", she continues.
"Damn it! Can we not have longer lectures?", you think and set alarms to all of the
upcoming lecture dates in your calendar.
-There is so much exciting stuff to come, you can not miss a single one.
+There is so much exciting stuff to come, you cannot miss a single one.
"Psst!" a student from the row behind reaches over. "There are
a bunch of audio recordings of a really cool podcast, and they have been shared in the form
@@ -34,7 +36,7 @@ of a DataLad dataset! Shall we try whether we can install that?"
now instead of looking at slides for hours is my preferred type of learning anyway",
you think as you fire up your terminal and navigate into your ``DataLad-101`` dataset.
-In this demonstration, we're using one of the many openly available datasets that
+In this demonstration, we are using one of the many openly available datasets that
DataLad provides in a public registry that anyone can access. One of these datasets is a
collection of audio recordings of a great podcast, the longnow seminar series [#f2]_.
It consists of audio recordings about long-term thinking, and while the DataLad-101
@@ -51,12 +53,10 @@ called recordings.
:cast: 01_dataset_basics
:notes: The next challenge is to clone an existing dataset from the web as a subdataset. First, we create a location for this
- # we are in the root of DataLad-101
+ $ # we are in the root of DataLad-101
$ mkdir recordings
-.. index:: ! datalad command; clone
-
The command that can be used to obtain a dataset is :dlcmd:`clone`,
but we often refer to the process of cloning a Dataset as *installing*.
Let's install the longnow podcasts in this new directory.
@@ -75,7 +75,7 @@ the podcasts as a *subdataset* of ``DataLad-101``. Because we are in the root
of the ``DataLad-101`` dataset, the pointer to the dataset is a ``.`` (which is Unix'
way of saying "current directory").
-As before with long commands, we line break the code below with a ``\``. You can
+As before with long commands, we line break the code with a ``\``. You can
copy it as it is presented here into your terminal, but in your own work you
can write commands like this into a single line.
@@ -99,12 +99,16 @@ also recorded where this dataset came from, thus capturing its *origin* as
:term:`provenance`. Even though this is not obvious at this point in time, later
chapters in this handbook will demonstrate how useful this information can be.
+.. index::
+ pair: clone; DataLad concept
.. gitusernote:: Clone internals
The :dlcmd:`clone` command uses :gitcmd:`clone`.
A dataset that is installed from an existing source, e.g., a path or URL,
is the DataLad equivalent of a *clone* in Git.
+.. index::
+ pair: clone into another dataset; with DataLad
.. find-out-more:: Do I have to install from the root of datasets?
No. Instead of from the *root* of the ``DataLad-101`` dataset, you could have also
@@ -117,18 +121,18 @@ chapters in this handbook will demonstrate how useful this information can be.
point to root of the top-most dataset. For example, if you navigate into ``recordings``,
the command would be:
- .. code-block:: bash
+ .. code-block:: console
- datalad clone -d^ https://github.com/datalad-datasets/longnow-podcasts.git longnow
+ $ datalad clone -d^ https://github.com/datalad-datasets/longnow-podcasts.git longnow
.. find-out-more:: What if I do not install into an existing dataset?
If you do not install into an existing dataset, you only need to omit the ``-d/--dataset``
option. You can try:
- .. code-block:: bash
+ .. code-block:: console
- datalad clone https://github.com/datalad-datasets/longnow-podcasts.git
+ $ datalad clone https://github.com/datalad-datasets/longnow-podcasts.git
anywhere outside of your ``DataLad-101`` dataset to install the podcast dataset into a new directory
called ``longnow-podcasts``. You could even do this inside of an existing dataset.
@@ -139,10 +143,12 @@ chapters in this handbook will demonstrate how useful this information can be.
Here is the repository structure:
-.. windows-wit:: tree -d may fail
+.. index::
+ pair: tree; terminal command
+ pair: display directory tree; on Windows
+.. windows-wit:: use tree
- If you have installed :term:`conda`\s ``m2-base`` package for access to Unix commands such as tree, you will have the tree command.
- However, this version of tree does not support the use of any command flags, so please just run ``tree`` instead of ``tree -d``.
+ .. include:: topic/tree-windows.rst
.. runrecord:: _examples/DL-101-105-103
:language: console
@@ -152,12 +158,9 @@ Here is the repository structure:
$ tree -d # we limit the output to directories
-We can see that recordings has one subdirectory, our newly installed ``longnow``
-dataset. Within the dataset are two other directories, ``Long_Now__Conversations_at_The_Interval``
-and ``Long_Now__Seminars_About_Long_term_Thinking``.
-If we navigate into one of them and list its content, we'll see many ``.mp3`` files (here is an
-excerpt).
-
+We can see that ``recordings`` has one subdirectory, our newly installed ``longnow``
+dataset with two subdirectories.
+If we navigate into one of them and list its content, we'll see many ``.mp3`` files (here is an excerpt).
.. runrecord:: _examples/DL-101-105-104
:language: console
@@ -180,7 +183,7 @@ a download of that many ``.mp3`` files not take much more time?
Here you can see another important feature of DataLad datasets
and the :dlcmd:`clone` command:
Upon installation of a DataLad dataset, DataLad retrieves only small files
-(for example text files or markdown files) and (small) metadata
+(for example, text files or markdown files) and (small) metadata
about the dataset. It does not, however, download any large files
(yet). The metadata exposes the dataset's file hierarchy
for exploration (note how you are able to list the dataset contents with ``ls``),
@@ -188,6 +191,8 @@ and downloading only this metadata speeds up the installation of a DataLad datas
of many TB in size to a few seconds. Just now, after installing, the dataset is
small in size:
+.. index::
+ pair: show file size; in a terminal
.. runrecord:: _examples/DL-101-105-105
:language: console
:workdir: dl-101/DataLad-101/recordings/longnow/Long_Now__Seminars_About_Long_term_Thinking
@@ -235,12 +240,16 @@ hard drive has much space left...
But you nevertheless are curious on how to actually listen to one of these ``.mp3``\s now.
So how does one actually "get" the files?
-.. index:: ! datalad command; get
+.. index::
+ pair: get; DataLad command
The command to retrieve file content is :dlcmd:`get`.
You can specify one or more specific files, or ``get`` all of the dataset by
specifying :dlcmd:`get .` at the root directory of the dataset (with ``.`` denoting "current directory").
+.. index::
+ pair: get file content; with DataLad
+
First, we get one of the recordings in the dataset -- take any one of your choice
(here, it's the first).
@@ -257,7 +266,7 @@ Try to open it -- it will now work.
If you would want to get the rest of the missing data, instead of specifying all files individually,
we can use ``.`` to refer to *all* of the dataset like this:
-.. code-block:: bash
+.. code-block:: console
$ datalad get .
@@ -265,6 +274,9 @@ However, with a total size of more than 15GB, this might take a while, so do not
If you did execute the command above, interrupt it by pressing ``CTRL`` + ``C`` -- Do not worry,
this will not break anything.
+.. index::
+ pair: show dataset size; with DataLad
+
Isn't that easy?
Let's see how much content is now present locally. For this, :dlcmd:`status --annex all`
has a nice summary:
@@ -298,10 +310,14 @@ DataLad summarizes the outcome of the execution of ``get`` in the end and inform
that the download of one file was ``notneeded`` and the retrieval of the other files was ``ok``.
+.. index::
+ pair: get; DataLad concept
.. gitusernote:: Get internals
:dlcmd:`get` uses :gitannexcmd:`get` underneath the hood.
+.. index::
+ pair: drop file content; with DataLad
Keep whatever you like
^^^^^^^^^^^^^^^^^^^^^^
@@ -310,14 +326,15 @@ Keep whatever you like
apparently downloaded the *full* dataset accidentally. "Is there a way to get rid
of file contents in dataset, too?", they ask. "Yes", the lecturer responds,
"you can remove file contents by using :dlcmd:`drop`. This is
-really helpful to save disk space for data you can easily re-obtain, for example".
+really helpful to save disk space for data you can easily reobtain, for example".
-.. index:: datalad command; drop
+.. index::
+ pair: drop; DataLad command
The :dlcmd:`drop` command will remove
file contents completely from your dataset.
You should only use this command to remove contents that you can :dlcmd:`get`
-again, or generate again (for example with next chapter's :dlcmd:`datalad run`
+again, or generate again (for example, with next chapter's :dlcmd:`datalad run`
command), or that you really do not need anymore.
Let's remove the content of one of the files that we have downloaded, and check
@@ -356,7 +373,7 @@ Whenever you need the recording again, it is easy to re-retrieve it:
$ datalad get Long_Now__Seminars_About_Long_term_Thinking/2004_01_10__George_Dyson__There_s_Plenty_of_Room_at_the_Top__Long_term_Thinking_About_Large_scale_Computing.mp3
-Re-obtained!
+Reobtained!
This was only a quick digression into :dlcmd:`drop`. The main principles
of this command will become clear after chapter
@@ -370,10 +387,14 @@ by using DataLad datasets -- and that really is a cool feature to have.
Dataset archeology
^^^^^^^^^^^^^^^^^^
-You have now experienced how easy it is to (re-)obtain shared data with DataLad.
+You have now experienced how easy it is to (re)obtain shared data with DataLad.
But beyond sharing only the *data* in the dataset, when sharing or installing
a DataLad dataset, all copies also include the dataset's *history*.
+.. index::
+ pair: log; Git command
+ pair: show history (reverse); with Git
+
For example, we can find out who created the dataset in the first place
(the output shows an excerpt of ``git log --reverse``, which displays the
history from first to most recent commit):
@@ -395,6 +416,10 @@ Because an installed dataset knows the dataset it was installed from,
your local dataset clone can be updated from its origin, and thus get the new recordings,
should there be some. Later in this handbook, we will see examples of this.
+.. index::
+ pair: update heredoc; in a terminal
+ pair: save dataset modification; with DataLad
+
Now you can not only create datasets and work with them locally, you can also consume
existing datasets by installing them. Because that's cool, and because you will use this
command frequently, make a note of it into your ``notes.txt``, and :dlcmd:`save` the
@@ -406,7 +431,7 @@ modification.
:cast: 01_dataset_basics
:notes: We can make a note about this:
- # in the root of DataLad-101:
+ $ # in the root of DataLad-101:
$ cd ../../
$ cat << EOT >> notes.txt
The command 'datalad clone URL/PATH [PATH]' installs a dataset from
@@ -417,17 +442,19 @@ modification.
EOT
$ datalad save -m "Add note on datalad clone"
+.. index::
+ pair: placeholder files; on Mac
.. importantnote:: Empty files can be confusing
- Listing files directly after the installation of a dataset will
- work if done in a terminal with ``ls``.
- However, certain file managers (such as OSX's Finder [#f3]_) may fail to
- display files that are not yet present locally (i.e., before a
- :dlcmd:`get` was run). Therefore, be mindful when exploring
- a dataset hierarchy with a file manager -- it might not show you
- the available but not yet retrieved files.
- Consider browsing datasets with the :term:`DataLad Gooey` to be on the safe side.
- More about why this is will be explained in section :ref:`symlink`.
+ Listing files directly after the installation of a dataset will
+ work if done in a terminal with ``ls``.
+ However, certain file managers (such as OSX's Finder [#f3]_) may fail to
+ display files that are not yet present locally (i.e., before a
+ :dlcmd:`get` was run). Therefore, be mindful when exploring
+ a dataset hierarchy with a file manager -- it might not show you
+ the available but not yet retrieved files.
+ Consider browsing datasets with the :term:`DataLad Gooey` to be on the safe side.
+ More about why this is will be explained in section :ref:`symlink`.
.. only:: adminmode
@@ -443,7 +470,7 @@ modification.
.. rubric:: Footnotes
-.. [#f1] Additionally, a source can also be a pointer to an open-data collection,
+.. [#f1] Additionally, a source can also be a pointer to an open-data collection,
for example :term:`the DataLad superdataset ///` -- more on what this is and how to
use it later, though.
diff --git a/docs/basics/101-106-nesting.rst b/docs/basics/101-106-nesting.rst
index 2cc8c4045..874e7b2ca 100644
--- a/docs/basics/101-106-nesting.rst
+++ b/docs/basics/101-106-nesting.rst
@@ -1,10 +1,10 @@
+.. index::
+ pair: dataset nesting; with DataLad
.. _nesting:
Dataset nesting
---------------
-.. index:: ! nesting
-
Without noticing, the previous section demonstrated another core principle
and feature of DataLad datasets: *Nesting*.
@@ -15,7 +15,6 @@ This was done by supplying the ``--dataset``/``-d`` flag in the command call.
At first glance, nesting does not seem particularly spectacular --
after all, any directory on a file system can have other directories inside of it.
-
The possibility for nested Datasets, however, is one of many advantages
DataLad datasets have:
@@ -42,6 +41,8 @@ looks like after the addition of a subdataset. To do this, make sure you are
addition to ``notes.txt``, so we'll look at the second most recent commit in
this excerpt.
+.. index::
+ pair: show commit patches; with Git
.. runrecord:: _examples/DL-101-106-101
:language: console
:workdir: dl-101/DataLad-101
@@ -54,7 +55,7 @@ this excerpt.
$ git log -p -n 3
We have highlighted the important part of this rather long commit summary.
-Note that you can not see any ``.mp3``\s being added to the dataset,
+Note that you cannot see any ``.mp3``\s being added to the dataset,
as was previously the case when we :dlcmd:`save`\d PDFs that we
downloaded into ``books/``. Instead,
DataLad stores what it calls a *subproject commit* of the subdataset.
@@ -91,6 +92,8 @@ This is what is meant by "the top-level DataLad dataset (the *superdataset*) onl
Importantly, once we learn how to make use of the history of a dataset,
we can set subdatasets to previous states, or *update* them.
+.. index::
+ pair: temporary working directory change; with Git
.. find-out-more:: Do I have to navigate into the subdataset to see its history?
Previously, we used :shcmd:`cd` to navigate into the subdataset, and
@@ -99,30 +102,32 @@ we can set subdatasets to previous states, or *update* them.
While moving around with ``cd`` is straightforward, you also found it
slightly annoying from time to time to use the ``cd`` command so often and also
to remember which directory you currently are in. There is one
- trick, though: ``git -C`` (note that it is a capital C) lets you perform any
+ trick, though: ``git -C`` and ``datalad -C`` (note that it is a capital C) let you perform any
Git or DataLad command in a provided path. Providing this option together with a path to
- a Git command let's you run the command as if Git was started in this path
+   a Git or DataLad command lets you run the command as if it was started in this path
instead of the current working directory.
Thus, from the root of ``DataLad-101``, this command would have given you the
subdataset's history as well:
- .. code-block:: bash
+ .. code-block:: console
$ git -C recordings/longnow log --oneline
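+
+   The same trick works for DataLad commands. Here is a minimal sketch (the
+   :dlcmd:`status` subcommand is just an illustrative choice):
+
+   .. code-block:: console
+
+      $ datalad -C recordings/longnow status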
In the upcoming sections, we'll experience the perks of dataset nesting
frequently, and everything that might seem vague at this point will become
clearer. To conclude this demonstration,
-the figure below illustrates the current state of our dataset, ``DataLad-101``, with its nested subdataset.
+:numref:`fignesting` illustrates the current state of our dataset, ``DataLad-101``, with its nested subdataset.
+Thus, without being consciously aware of it, by taking advantage of dataset
+nesting, we took a dataset ``longnow`` and installed it as a
+subdataset within the superdataset ``DataLad-101``.
+
+.. _fignesting:
.. figure:: ../artwork/src/virtual_dstree_dl101.svg
:width: 70%
Virtual directory tree of a nested DataLad dataset
-Thus, without being consciously aware of it, by taking advantage of dataset
-nesting, we took a dataset ``longnow`` and installed it as a
-subdataset within the superdataset ``DataLad-101``.
If you have executed the above code snippets, make sure to go back into the
root of the dataset again:
diff --git a/docs/basics/101-107-summary.rst b/docs/basics/101-107-summary.rst
index 589c854f7..c2af46e49 100644
--- a/docs/basics/101-107-summary.rst
+++ b/docs/basics/101-107-summary.rst
@@ -8,9 +8,9 @@ and making simple modifications *locally*.
to the dataset and use the ``-c text2git`` configuration, but we will see later why.
This is the command structure:
- .. code-block:: bash
+ .. code-block:: console
- datalad create --description "here is a description" -c text2git PATH
+ $ datalad create --description "here is a description" -c text2git PATH
* Thanks to :term:`Git` and :term:`git-annex`, the dataset has a history to track files and their
modifications. Built-in Git tools (:gitcmd:`log`) or external tools (such as ``tig``) allow you to explore
@@ -23,9 +23,9 @@ and making simple modifications *locally*.
specifying a path, all untracked files and all file changes will be committed to the history together!
This is the command structure:
- .. code-block:: bash
+ .. code-block:: console
- datalad save -m "here is a commit message" [PATH]
+ $ datalad save -m "here is a commit message" [PATH]
* The typical local workflow is simple: *Modify* the dataset by adding or
modifying files, *save* the changes as meaningful units to the history,
@@ -48,11 +48,12 @@ and making simple modifications *locally*.
Furthermore, we have discovered the basics of installing a published DataLad dataset,
and experienced the concept of modular nesting datasets.
-.. index:: ! datalad command; clone
+.. index::
+ pair: clone; DataLad command
* A published dataset can be installed with the :dlcmd:`clone` command:
- .. code-block:: bash
+ .. code-block:: console
$ datalad clone [--dataset PATH] SOURCE-PATH/URL [DESTINATION PATH]
@@ -66,7 +67,7 @@ and experienced the concept of modular nesting datasets.
* If a dataset is installed inside of another dataset as a subdataset, the
``--dataset``/``-d`` option needs to specify the root of the containing dataset, the superdataset.
-* The source can be a URL (for example of a GitHub repository, as in section :ref:`installds`), but also
+* The source can be a URL, for example of a GitHub repository as in section :ref:`installds`, but also
a path or an open data collection.
* After :dlcmd:`clone`, only small files and metadata about file availability are present locally.
@@ -80,11 +81,11 @@ and experienced the concept of modular nesting datasets.
version of a contained subdataset through an identifier.
-Now what I can do with that?
+Now what can I do with that?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Simple, local workflows allow you to version control changing small files,
-for example your CV, your code, or a book that you are working on, but
+for example, your CV, your code, or a book that you are working on, but
you can also add very large files to your dataset's history.
Currently, this can be considered "best-practice building": Frequent :dlcmd:`status`
commands, :dlcmd:`save` commands to save dataset modifications,
diff --git a/docs/basics/101-108-run.rst b/docs/basics/101-108-run.rst
index 48fde9081..309387739 100644
--- a/docs/basics/101-108-run.rst
+++ b/docs/basics/101-108-run.rst
@@ -36,12 +36,12 @@ list of speakers and titles to cross out what they've already listened
to, and ask you to prepare such a list.
"Mhh... probably there is a DataLad way to do this... wasn't there also
-a note about metadata extraction at some point?" But as we're not that
+a note about metadata extraction at some point?" But as we are not that
far into the lectures, you decide to write a short shell script
to generate a text file that lists speaker and title
name instead.
-To do this, we're following a best practice that will reappear in the
+To do this, we are following a best practice that will reappear in the
later section on :ref:`YODA principles <yoda>`: Collecting all
additional scripts that work with content of a subdataset *outside*
of this subdataset, in a dedicated ``code/`` directory,
@@ -68,41 +68,17 @@ Inside of ``DataLad-101/code``, create a simple shell script ``list_titles.sh``.
This script will carry out a simple task:
It will loop through the file names of the ``.mp3`` files and
write out speaker names and talk titles in a very basic fashion.
-The content of this script is written below -- the ``cat`` command
-will write it into the script.
+The ``cat`` command will write the script content into ``code/list_titles.sh``.
.. windows-wit:: Here's a script for Windows users
- Please use an editor of your choice to create a file ``list_titles.sh`` inside of the ``code`` directory.
- These should be the contents:
-
- .. code-block:: bash
-
- for i in recordings/longnow/Long_Now__Seminars*/*.mp3; do
- # get the filename
- base=$(basename "$i");
- # strip the extension
- base=${base%.mp3};
- # date as yyyy-mm-dd
- printf "${base%%__*}\t" | tr '_' '-';
- # name and title without underscores
- printf "${base#*__}\n" | tr '_' ' ';
- done
-
- Note that this is not identical to the one below -- it lacks a few ``\`` characters, which is a meaningful difference.
+ .. include:: topic/globscript1-windows.rst
+.. index::
+ pair: hidden file name extensions; on Windows
.. windows-wit:: Be mindful of hidden extensions when creating files!
- By default, Windows does not show common file extensions when you view directory contents with a file explorer.
- Instead, it only displays the base of the file name and indicates the file type with the display icon.
- You can see if this is the case for you, too, by opening the ``books\`` directory in a file explorer, and checking if the file extension (``.pdf``) is a part of the file name displayed underneath its PDF icon.
-
- Hidden file extensions can be a confusing source of errors, because some Windows editors (for example Notepad) automatically add a ``.txt`` extension to your files -- when you save the script above under the name ``list_titles.sh``, your editor may add an extension (``list_titles.sh.txt``), and the file explorer displays your file as ``list_titles.sh`` (hiding the ``.txt`` extension).
-
- To prevent confusion, configure the file explorer to always show you the file extension.
- For this, open the Explorer, click on the "View" tab, and tick the box "File name extensions".
-
- Beyond this, double check the correct naming of your file, ideally in the terminal.
+ .. include:: topic/hidden-extensions.rst
.. runrecord:: _examples/DL-101-108-102
:language: console
@@ -152,7 +128,10 @@ in a bit of time, we will forget how this file came into existence, or
that the script ``code/list_titles.sh`` is associated with this file, and
can be used to update it later on.
-.. index:: ! datalad command; run
+.. index::
+ pair: run; DataLad command
+ pair: run command with provenance capture; with DataLad
+ pair: run command with provenance capture; with DataLad run
The :dlcmd:`run` command
can help with this. Put simply, it records a command's impact on a dataset. Put
diff --git a/docs/basics/101-109-rerun.rst b/docs/basics/101-109-rerun.rst
index 6a4c97a9c..f1966bf89 100644
--- a/docs/basics/101-109-rerun.rst
+++ b/docs/basics/101-109-rerun.rst
@@ -1,10 +1,10 @@
+.. index::
+ pair: rerun; DataLad command
.. _run2:
-DataLad, Re-Run!
+DataLad, rerun!
----------------
-.. index:: ! datalad command; rerun
-
So far, you created a ``.tsv`` file of all
speakers and talk titles in the ``longnow/`` podcasts subdataset.
Let's actually take a look into this file now:
@@ -34,21 +34,7 @@ with the following, fixed script:
.. windows-wit:: Here's a script adjustment for Windows users
- Please use an editor of your choice to replace the contents of ``list_titles.sh`` inside of the ``code`` directory with the following:
-
- .. code-block:: bash
-
- for i in recordings/longnow/Long_Now*/*.mp3; do
- # get the filename
- base=$(basename "$i");
- # strip the extension
- base=${base%.mp3};
- # date as yyyy-mm-dd
- printf "${base%%__*}\t" | tr '_' '-';
- # name and title without underscores
- printf "${base#*__}\n" | tr '_' ' ';
- done
-
+ .. include:: topic/globscript2-windows.rst
.. runrecord:: _examples/DL-101-109-102
:language: console
@@ -91,9 +77,9 @@ We can use the shorthand "BF" to denote "Bug fix" in the commit message.
What we *could* do is run the same :dlcmd:`run` command as before to recreate
the file, but now with all of the contents:
-.. code-block:: bash
+.. code-block:: console
- # do not execute this!
+ $ # do not execute this!
$ datalad run -m "create a list of podcast titles" \
"bash code/list_titles.sh > recordings/podcasts.tsv"
@@ -124,7 +110,7 @@ So you go ahead and find the commit :term:`shasum` in your history:
Take that shasum and paste it after :dlcmd:`rerun`
(the first 6-8 characters of the shasum would be sufficient,
-here we're using all of them).
+here we are using all of them).
.. runrecord:: _examples/DL-101-109-106
:language: console
@@ -158,7 +144,8 @@ we can see that a new :dlcmd:`run` was recorded. This action is
committed by DataLad under the original commit message of the ``run``
command, and looks just like the previous :dlcmd:`run` commit.
-.. index:: ! datalad command; diff
+.. index::
+ pair: diff; DataLad command
Two cool tools that go beyond the :gitcmd:`log`
are the :dlcmd:`diff` and :gitcmd:`diff` commands.
@@ -171,20 +158,17 @@ and another state from the dataset's history (a commit specified with
``-t``/``--to``). Let's do a :dlcmd:`diff` between the current state
of the dataset and the previous commit (called "``HEAD~1``" in Git terminology [#f1]_):
-.. windows-wit:: please use datalad diff --from main --to HEAD~1
+.. index::
+ pair: show dataset modification; on Windows with DataLad
+ pair: diff; DataLad command
+ pair: corresponding branch; in adjusted mode
+.. windows-wit:: please use 'datalad diff --from main --to HEAD~1'
- While this example works on Unix file systems, it will not provide the same output on Windows.
- This is due to different file handling on Windows.
- When executing this command, you will see *all* files being modified between the most recent and the second-most recent commit.
- On a technical level, this is correct given the underlying file handling on Windows, and chapter :ref:`chapter_gitannex` will shed light on why that is.
+ .. include:: topic/adjustedmode-diff.rst
- For now, to get the same output as shown in the code snippet below, use the following command where ``main`` (or ``master``) is the name of your default branch:
-
- .. code-block:: bash
-
- datalad diff --from main --to HEAD~1
-
- The ``--from`` argument specifies a different starting point for the comparison - the ``main`` or :term:`master` :term:`branch`, which would be the starting point on most Unix-based systems.
+.. index::
+ pair: diff; Git command
+ pair: show dataset modification; with DataLad
.. runrecord:: _examples/DL-101-109-108
:language: console
@@ -194,6 +178,10 @@ of the dataset and the previous commit (called "``HEAD~1``" in Git terminology [
$ datalad diff --to HEAD~1
+.. index::
+ pair: diff; Git command
+ pair: show dataset modification; with Git
+
This indeed shows the output file as "modified". However, we do not know
what exactly changed. This is a task for :gitcmd:`diff` (get out of the
diff view by pressing ``q``):
@@ -239,7 +227,7 @@ Finally, save this note.
Note that :dlcmd:`rerun` can re-execute the run records of either a :dlcmd:`run`
or a :dlcmd:`rerun` command,
-but not with any other type of datalad command in your history
+but not any other type of DataLad command in your history
such as a :dlcmd:`save` on results or outputs after you executed a script.
Therefore, make it a
habit to record the execution of scripts by plugging them into :dlcmd:`run`.
@@ -256,12 +244,16 @@ other tools than from the machine-readable ``run record``.
For example, to find out who (or what) created or modified a file,
give the file path to :gitcmd:`log` (prefixed by ``--``):
-.. windows-wit:: use "git log main -- recordings/podcasts.tsv"
+.. index::
+ pair: show history for particular paths; on Windows with Git
+ pair: log; Git command
+ pair: corresponding branch; in adjusted mode
+.. windows-wit:: use 'git log main -- recordings/podcasts.tsv'
- A previous Windows Wit already advised to append ``main`` or ``master``, the common "default :term:`branch`", to any command that starts with ``git log``.
- Here, the last part of the command specifies a file (``-- recordings/podcasts.tsv``).
- Please append ``main`` or ``master`` to ``git log``, prior to the file specification.
+ .. include:: topic/adjustedmode-log-path.rst
+.. index::
+ pair: show history for particular paths; with Git
.. runrecord:: _examples/DL-101-109-112
:language: console
:workdir: dl-101/DataLad-101
diff --git a/docs/basics/101-110-run2.rst b/docs/basics/101-110-run2.rst
index 3866dbf07..56fea21d8 100644
--- a/docs/basics/101-110-run2.rst
+++ b/docs/basics/101-110-run2.rst
@@ -39,14 +39,14 @@ in the hidden paths
.. runrecord:: _examples/DL-101-110-101
:language: console
:workdir: dl-101/DataLad-101
- :notes: We saw a very simple datalad run. Now we're going to extend it with useful options. Narrative: prepare talk about dataset, add logo to slides. For this, we'll try to resize a logo in the meta data of the subdataset
+ :notes: We saw a very simple datalad run. Now we are going to extend it with useful options. Narrative: prepare talk about dataset, add logo to slides. For this, we'll try to resize a logo in the meta data of the subdataset
:cast: 02_reproducible_execution
$ ls recordings/longnow/.datalad/feed_metadata/*jpg
For the slides you decide to prepare images of size 400x400 px, but
the logos' original size is much larger (both are 3000x3000 pixel). Therefore
-let's try to resize the images -- currently, they're far too large to fit on a slide.
+let's try to resize the images -- currently, they are far too large to fit on a slide.
To resize an image from the command line we can use the Unix
command ``convert -resize`` from the `ImageMagick tool <https://imagemagick.org>`_.
@@ -56,12 +56,12 @@ resized image will be saved.
To resize one image to 400x400 px, the command would thus be
``convert -resize 400x400 path/to/file.jpg path/to/newfilename.jpg``.
+.. index::
+ pair: install ImageMagick; on Windows
+ single: installation; ImageMagick
.. windows-wit:: Tool installation
- `ImageMagick `_ is not installed on Windows systems by default.
- To use it, you need to install it, using the provided `Windows Binary Release on the Download page `_.
- During installation, it is important to install the tool into a place where it is easily accessible to your terminal, for example the ``Program Files`` folder.
- Do also make sure to tick the box "install legacy commands" in the installation wizard.
+ .. include:: topic/installation-imagemagick.rst
Remembering the last lecture on :dlcmd:`run`, you decide to plug this into
:dlcmd:`run`. Even though this is not a script, it is a command, and you can wrap
@@ -70,6 +70,8 @@ Because they will be quite long, we line break the commands in the upcoming exam
for better readability -- in your terminal, you can always write the commands into
a single line.
+.. index::
+ pair: run command with provenance capture; with DataLad run
.. runrecord:: _examples/DL-101-110-102
:language: console
:workdir: dl-101/DataLad-101
@@ -93,7 +95,7 @@ A :dlcmd:`run` error message has several parts. The first starts after
``[INFO ] == Command start (output follows) =====``.
This is displaying errors that the
-terminal command threw: The ``convert`` tool complains that it can not open
+terminal command threw: The ``convert`` tool complains that it cannot open
the file, because there is "No such file or directory".
The second part starts after
@@ -111,14 +113,17 @@ How can that be?
Just as the ``.mp3`` files, the ``.jpg`` file content is not present
locally after a :dlcmd:`clone`, and we did not :dlcmd:`get` it yet!
-This is where the ``-i``/``--input`` option for a datalad run becomes useful.
+.. index::
+ pair: declare command input; with DataLad run
+
+This is where the ``-i``/``--input`` option for a ``datalad run`` becomes useful.
The content of everything that is specified as an ``input`` will be retrieved
prior to running the command.
.. runrecord:: _examples/DL-101-110-103
:language: console
:workdir: dl-101/DataLad-101
- :emphasize-lines: 8, 11, 13
+ :emphasize-lines: 8
:realcommand: datalad run --input "recordings/longnow/.datalad/feed_metadata/logo_salt.jpg" "convert -resize 400x400 recordings/longnow/.datalad/feed_metadata/logo_salt.jpg recordings/salt_logo_small.jpg"
:notes: The problem is that the content (logo) is not yet retrieved. The --input option makes sure that all content is retrieved prior to command execution.
:cast: 02_reproducible_execution
@@ -126,7 +131,7 @@ prior to running the command.
$ datalad run -m "Resize logo for slides" \
--input "recordings/longnow/.datalad/feed_metadata/logo_salt.jpg" \
"convert -resize 400x400 recordings/longnow/.datalad/feed_metadata/logo_salt.jpg recordings/salt_logo_small.jpg"
- # or shorter:
+ $ # or shorter:
$ datalad run -m "Resize logo for slides" \
-i "recordings/longnow/.datalad/feed_metadata/logo_salt.jpg" \
"convert -resize 400x400 recordings/longnow/.datalad/feed_metadata/logo_salt.jpg recordings/salt_logo_small.jpg"
@@ -142,39 +147,33 @@ that :dlcmd:`get` will only retrieve content if
it is not yet present, all input already downloaded will not be downloaded again -- so
specifying inputs even though they are already present will not do any harm.
+.. index::
+ pair: path globbing; with DataLad run
.. find-out-more:: What if there are several inputs?
Often, a command needs several inputs. In principle, every input (which could be files, directories, or subdatasets) gets its own ``-i``/``--input``
flag. However, you can make use of :term:`globbing`. For example,
- .. code-block:: bash
+ .. code-block:: console
- datalad run --input "*.jpg" "COMMAND"
+ $ datalad run --input "*.jpg" "COMMAND"
will retrieve all ``.jpg`` files prior to command execution.
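+
+   Alternatively, a minimal sketch with several explicit input flags could
+   look like this (the file names are purely hypothetical):
+
+   .. code-block:: console
+
+      $ datalad run -i "file1.jpg" -i "file2.jpg" "COMMAND"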
If outputs already exist...
^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. index::
+ pair: files are unlocked by default; on Windows
+ pair: unlocked files; in adjusted mode
.. windows-wit:: Good news! Here is something that is easier on Windows
- The section below describes something that is very confusing for people that have just started with DataLad: Some files in a dataset can't be modified, and if one tries, it results in a "permission denied" error.
- Why is that?
- The remainder of this section and the upcoming chapter :ref:`chapter_gitannex` contain a procedural explanation.
- However: This doesn't happen on Windows.
- The "unlocking" that is necessary on almost all other systems to modify a file is already done on Windows.
- Thus, all files in your dataset will be readily modifiable, sparing you the need to adjust to the unexpected behavior that is described below.
- While it is easier, it isn't a "more useful" behavior, though.
- A different Windows Wit in the next chapter will highlight how it rather is a suboptimal workaround.
-
- Please don't skip the next section -- it is useful to know how datasets behave on other systems.
- Just be mindful that you will not encounter the errors that the handbook displays next.
- And while this all sounds quite cryptic and vague, an upcoming Windows Wit will provide more information.
+ .. include:: topic/adjustedmode-unlockedfiles.rst
Looking at the resulting image, you wonder whether 400x400 might be a tiny bit too small.
Maybe we should try to resize it to 450x450, and see whether that looks better?
-Note that we can not use a :dlcmd:`rerun` for this: if we want to change the dimension option
+Note that we cannot use a :dlcmd:`rerun` for this: if we want to change the dimension option
in the command, we have to define a new :dlcmd:`run` command.
To establish best practices, let's specify the input even though it is already present:
@@ -183,7 +182,7 @@ To establish best-practices, let's specify the input even though it is already p
.. runrecord:: _examples/DL-101-110-104
:language: console
:workdir: dl-101/DataLad-101
- :emphasize-lines: 10
+ :emphasize-lines: 9
:realcommand: datalad run --input "recordings/longnow/.datalad/feed_metadata/logo_salt.jpg" "convert -resize 450x450 recordings/longnow/.datalad/feed_metadata/logo_salt.jpg recordings/salt_logo_small.jpg"
:notes: Maybe 400x400 is too small. We should try 450x450. Can we use a datalad rerun for this? (no)
:exitcode: 1
@@ -192,7 +191,7 @@ To establish best-practices, let's specify the input even though it is already p
$ datalad run -m "Resize logo for slides" \
--input "recordings/longnow/.datalad/feed_metadata/logo_salt.jpg" \
"convert -resize 450x450 recordings/longnow/.datalad/feed_metadata/logo_salt.jpg recordings/salt_logo_small.jpg"
- # or shorter:
+ $ # or shorter:
$ datalad run -m "Resize logo for slides" \
-i "recordings/longnow/.datalad/feed_metadata/logo_salt.jpg" \
"convert -resize 450x450 recordings/longnow/.datalad/feed_metadata/logo_salt.jpg recordings/salt_logo_small.jpg"
@@ -222,7 +221,7 @@ DataLad usually gives content to :term:`git-annex` to store and track.
git-annex, let's just say, takes this task *really* seriously. One of its
features that you have just experienced is that it *locks* content.
-If files are *locked down*, their content can not be modified. In principle,
+If files are *locked down*, their content cannot be modified. In principle,
that's not a bad thing: It could be your late grandma's secret cherry-pie
recipe, and you do not want to *accidentally* change that.
Therefore, a file needs to be consciously *unlocked* to apply modifications.
@@ -231,17 +230,20 @@ In the attempt to resize the image to 450x450 you tried to overwrite
``recordings/salt_logo_small.jpg``, a file that was given to DataLad
and thus protected by git-annex.
-.. index:: ! datalad command; unlock
+.. index::
+ pair: unlock; DataLad command
+ pair: unlock file; with DataLad
There is a DataLad command that takes care of unlocking file content,
and thus making locked files modifiable again: :dlcmd:`unlock`.
Let us check out what it does:
+.. index::
+ pair: files are unlocked by default; on Windows
+ single: adjusted branch; unlocked files
.. windows-wit:: What happens if I run this on Windows?
- Nothing. All of the files in your dataset are always unlocked, and actually *can not* be locked at all.
- Consequently, there will be nothing to show for ``datalad status`` afterwards (as shown a few paragraphs below).
- This is due to a file system limitation, and will be explained in more detail in chapter :ref:`chapter_gitannex`.
+ .. include:: topic/adjustedmode-unlockedfiles2.rst
.. runrecord:: _examples/DL-101-111-101
:language: console
@@ -304,12 +306,13 @@ command execution. Therefore, whenever the output of a :dlcmd:`run` command alre
exists and is tracked, it should be specified as an argument in
the ``-o``/``--output`` option.
+.. index::
+ pair: path globbing; with DataLad run
.. find-out-more:: But what if I have a lot of outputs?
The use case here is simplistic -- a single file gets modified.
But there are commands and tools that create full directories with
- many files as an output, for example
- `FSL `_, a neuro-imaging tool.
+ many files as an output.
The easiest way to specify this type of output
is by supplying the directory name, or the directory name and a :term:`globbing` character, such as
``-o directory/*.dat``.
@@ -317,19 +320,24 @@ the ``-o``/``--output`` option.
To glob for files in multiple levels of directories, use ``**`` (a so-called `globstar `_) for a recursive glob through any number of directories.
And, just as for ``-i``/``--input``, you could use multiple ``--output`` specifications.
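+
+   As a hypothetical sketch, a recursive glob over nested output directories
+   could look like this:
+
+   .. code-block:: console
+
+      $ datalad run -o "outputs/**/*.dat" "COMMAND"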
+.. index::
+ pair: declare command output; with DataLad run
+
In order to execute :dlcmd:`run` with both the ``-i``/``--input`` and ``-o``/``--output``
flag and see their magic, let's crop the second logo, ``logo_interval.jpg``:
+.. index::
+ pair: files are unlocked by default; on Windows
+ pair: run; DataLad command
+ pair: unlocked files; in adjusted mode
.. windows-wit:: Wait, would I need to specify outputs, too?
- Given that nothing in your dataset is locked, is there a *need* for you to bother with creating ``--output`` flags?
- Not for you personally, if you only stay on your Windows machine.
- However, you will be doing others that you share your dataset with a favor if they are not using Windows -- should you or others want to rerun a run record, ``--output`` flags will make it work on all operating systems.
+ .. include:: topic/adjustedmode-unlockedfiles-output.rst
.. runrecord:: _examples/DL-101-111-105
:language: console
:workdir: dl-101/DataLad-101
- :emphasize-lines: 14, 16
+ :emphasize-lines: 11
:realcommand: datalad run --input "recordings/longnow/.datalad/feed_metadata/logo_interval.jpg" --output "recordings/interval_logo_small.jpg" "convert -resize 450x450 recordings/longnow/.datalad/feed_metadata/logo_interval.jpg recordings/interval_logo_small.jpg"
:notes: but it is way easier to just use the --output option of datalad run: it takes care of unlocking if necessary
:cast: 02_reproducible_execution
@@ -339,7 +347,7 @@ flag and see their magic, let's crop the second logo, ``logo_interval.jpg``:
--output "recordings/interval_logo_small.jpg" \
"convert -resize 450x450 recordings/longnow/.datalad/feed_metadata/logo_interval.jpg recordings/interval_logo_small.jpg"
- # or shorter:
+ $ # or shorter:
$ datalad run -m "Resize logo for slides" \
-i "recordings/longnow/.datalad/feed_metadata/logo_interval.jpg" \
-o "recordings/interval_logo_small.jpg" \
@@ -407,7 +415,7 @@ for inputs and outputs. This is how it works:
Instead of running
-.. code-block:: bash
+.. code-block:: console
$ datalad run -m "Resize logo for slides" \
--input "recordings/longnow/.datalad/feed_metadata/logo_interval.jpg" \
@@ -416,7 +424,7 @@ Instead of running
you could shorten this to
-.. code-block:: bash
+.. code-block:: console
:emphasize-lines: 4
$ datalad run -m "Resize logo for slides" \
@@ -429,11 +437,13 @@ the placeholder ``{outputs}`` will expand to the path given as ``--output``.
This means instead of writing the full paths in the command, you can simply reuse
the ``--input`` and ``--output`` specification done before.
+.. index::
+ pair: multiple command inputs; with DataLad run
.. find-out-more:: What if I have multiple inputs or outputs?
If multiple values are specified, e.g., as in
- .. code-block:: bash
+ .. code-block:: console
$ datalad run -m "move a few files around" \
--input "file1" --input "file2" --input "file3" \
@@ -442,7 +452,7 @@ the ``--input`` and ``--output`` specification done before.
the values will be joined by a space like this:
- .. code-block:: bash
+ .. code-block:: console
$ datalad run -m "move a few files around" \
--input "file1" --input "file2" --input "file3" \
@@ -454,7 +464,7 @@ the ``--input`` and ``--output`` specification done before.
If you use globs for input specification, as in
- .. code-block:: bash
+ .. code-block:: console
$ datalad run -m "move a few files around" \
--input "file*" \
@@ -463,7 +473,7 @@ the ``--input`` and ``--output`` specification done before.
the globs will be expanded in alphabetical order (like bash):
- .. code-block:: bash
+ .. code-block:: console
$ datalad run -m "move a few files around" \
--input "file1" --input "file2" --input "file3" \
@@ -474,12 +484,16 @@ the ``--input`` and ``--output`` specification done before.
can be accessed with an integer index, e.g., ``{inputs[0]}`` for the very first
input.
-.. find-out-more:: ... wait, what if I need a curly bracket in my datalad run call?
+.. index::
+ pair: run command with curly brackets; with DataLad run
+.. find-out-more:: ... wait, what if I need a curly bracket in my 'datalad run' call?
If your command call involves a ``{`` or ``}`` character, you will need to escape
this brace character by doubling it, i.e., ``{{`` or ``}}``.
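+
+   For example, this hypothetical call would write *single* braces into the
+   resulting file, as each doubled brace is de-escaped once:
+
+   .. code-block:: console
+
+      $ datalad run "echo 'curly brackets: {{ }}' > braces.txt"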
+.. index::
+ pair: dry-run; with DataLad run
.. _dryrun:
Dry-running your run call
@@ -504,5 +518,4 @@ Apart from displaying the command that will be ran, you will learn *where* the c
.. [#f1] In shell programming, commands exit with a specific code that indicates
whether they failed, and if so, how. Successful commands have the exit code zero. All failures
- have exit codes greater than zero. A few lines lower, DataLad tells us the specific error
- code: The command failed with exit code 1.
+ have exit codes greater than zero.
diff --git a/docs/basics/101-112-run4.rst b/docs/basics/101-112-run4.rst
index c06ebbd38..38b10b9f4 100644
--- a/docs/basics/101-112-run4.rst
+++ b/docs/basics/101-112-run4.rst
@@ -101,6 +101,9 @@ to a :dlcmd:`run`:
EOT
+.. index::
+ pair: run command on dirty dataset; with DataLad run
+
A way of executing a :dlcmd:`run` *despite* an "unclean" dataset,
though, is to add the ``--explicit`` flag to :dlcmd:`run`.
We will try this flag with the remaining ``logo_salt.jpg``. Note that
@@ -125,6 +128,8 @@ It does not warn if the repository is dirty, but importantly, it
**only** saves modifications to the *listed outputs* (which is a problem in the
vast number of cases where one does not exactly know which outputs are produced).
+.. index::
+ pair: explicit input/output declaration; with DataLad run
.. importantnote:: Put explicit first!
The ``--explicit`` flag has to be given anywhere *prior* to the command that
diff --git a/docs/basics/101-113-summary.rst b/docs/basics/101-113-summary.rst
index 93b9f4806..02b9d55f1 100644
--- a/docs/basics/101-113-summary.rst
+++ b/docs/basics/101-113-summary.rst
@@ -12,7 +12,7 @@ command, and discovered the concept of *locked* content.
track of what you do in your dataset by capturing all :term:`provenance`.
* A :dlcmd:`run` command generates a ``run record`` in the commit. This :term:`run record` can be used
- by datalad to re-execute a command with :dlcmd:`rerun SHASUM`, where SHASUM is the
+ by DataLad to re-execute a command with :dlcmd:`rerun SHASUM`, where SHASUM is the
commit hash of the :dlcmd:`run` command that should be re-executed.
* If a :dlcmd:`run` or :dlcmd:`rerun` does not modify any content, it will not write a
@@ -22,7 +22,7 @@ command, and discovered the concept of *locked* content.
to the executed command (using the ``-i``/``--input`` flag) and/or its output (using the ``-o``/
``--output`` flag). The full command structure is:
- .. code-block:: bash
+ .. code-block:: console
$ datalad run -m "commit message here" --input "path/to/input/" --output "path/to/output" "command"
@@ -47,7 +47,7 @@ command, and discovered the concept of *locked* content.
should be "clean"), or the command needs to be extended with the ``--explicit`` option.
-Now what I can do with that?
+Now what can I do with that?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
You have procedurally experienced how to use :dlcmd:`run` and :dlcmd:`rerun`. Both
diff --git a/docs/basics/101-114-txt2git.rst b/docs/basics/101-114-txt2git.rst
index b9bf6ed6a..3105749e9 100644
--- a/docs/basics/101-114-txt2git.rst
+++ b/docs/basics/101-114-txt2git.rst
@@ -15,15 +15,17 @@ I was able to modify this despite giving it to DataLad to track, with
no permission denied errors whatsoever! How does that work?"
This night, though, your question stays unanswered and you fall into a restless
-sleep filled with bad dreams about "permission denied" errors. The next day you're
+sleep filled with bad dreams about "permission denied" errors. The next day you are
the first student in your lecturer's office hours.
-"Oh, you're really attentive. This is a great question!" our lecturer starts
+"Oh, you are really attentive. This is a great question!" our lecturer starts
to explain.
.. figure:: ../artwork/src/teacher.svg
:width: 50%
+.. index:: ! dataset procedure; text2git
+
Do you remember that we created the ``DataLad-101`` dataset with a
specific configuration template? It was the ``-c text2git`` option we
provided in the beginning of :ref:`createDS`. It is because of this configuration
@@ -50,13 +52,15 @@ Well, procedurally it means that everything that is stored in git-annex is
content-locked, and everything that is stored in Git is not. You can modify
content stored in Git straight away, without unlocking it first.
+.. _fig-gitvsannex:
+
.. figure:: ../artwork/src/git_vs_gitannex.svg
:alt: A simplified illustration of content lock in files managed by git-annex.
:width: 50%
A simplified overview of the tools that manage data in your dataset.
-That's easy enough.
+That's easy enough, and illustrated in :numref:`fig-gitvsannex`.
"So, first of all: If we hadn't provided the ``-c text2git`` argument, text files
would get content-locked, too?" "Yes, indeed. However, there are also ways to
@@ -83,6 +87,9 @@ complexity of this completely if ``-o/--output`` is specified properly, and
:dlcmd:`unlock` commands can be used to unlock content "by hand" if
modifications are performed outside of a :dlcmd:`run`.
+.. index::
+ pair: adjusted mode; git-annex concept
+
But there comes the second, tricky part: There are ways to get rid of locking and
unlocking within git-annex, using so-called :term:`adjusted branch`\es.
This functionality is dependent on the git-annex version one has installed, the git-annex version of the repository, and a use-case dependent comparison of the pros and cons.
diff --git a/docs/basics/101-115-symlinks.rst b/docs/basics/101-115-symlinks.rst
index 80e8125b3..1b8de56e1 100644
--- a/docs/basics/101-115-symlinks.rst
+++ b/docs/basics/101-115-symlinks.rst
@@ -5,7 +5,7 @@ Data integrity
--------------
So far, we mastered quite a number of challenges:
-Creating and populating a dataset with large and small files, modifying content and saving the changes to history, installing datasets, even as subdatasets within datasets, recording the impact of commands on a dataset with the run and re-run commands, and capturing plenty of :term:`provenance` on the way.
+Creating and populating a dataset with large and small files, modifying content and saving the changes to history, installing datasets, even as subdatasets within datasets, recording the impact of commands on a dataset with the :dlcmd:`run` and :dlcmd:`rerun` commands, and capturing plenty of :term:`provenance` on the way.
We further noticed that when we modified content in ``notes.txt`` or ``list_titles.sh``, the modified content was in a *text file*.
We learned that this precise type of file, in conjunction with the initial configuration template ``text2git`` we gave to :dlcmd:`create`, is meaningful:
As the text file is stored in Git and not git-annex, no content unlocking is necessary.
@@ -18,12 +18,15 @@ You might have noticed already that an ``ls -l`` or ``tree`` command in your dat
Maybe your shell also displays these files in a different color than text files when listing them.
We'll take a look together, using the ``books/`` directory as an example:
-.. windows-wit:: This will look different to you
+.. index::
+ pair: no symlinks; on Windows
+ pair: tree; terminal command
+.. windows-wit:: Dataset directories look different on Windows
- First of all, the ``tree`` equivalent provided by :term:`conda`\s ``m2-base`` package doesn't list individual files, only directories.
- And, secondly, even if you list the individual files (e.g., with ``ls -l``), you would not see the :term:`symlink`\s shown below.
- Due to insufficient support of symlinks on Windows, git-annex does not use them.
- Please read on for a basic understanding of how git-annex usually works -- a Windows Wit at the end of this section will then highlight the difference in functionality on Windows.
+ First of all, the Windows ``tree`` command lists only directories by default, unless you parametrize it with ``/f``.
+ And, secondly, even if you list the individual files, you would not see the :term:`symlink`\s shown below.
+ Due to insufficient support for symlinks on Windows, git-annex does not use them.
+   The :windows-wit:`on git-annex's adjusted mode <ww-adjusted-mode>` has more on that.
.. runrecord:: _examples/DL-101-115-101
:language: console
@@ -31,7 +34,7 @@ We'll take a look together, using the ``books/`` directory as an example:
:notes: We have to talk about symlinks now.
:cast: 03_git_annex_basics
- # in the root of DataLad-101
+ $ # in the root of DataLad-101
$ cd books
$ tree
@@ -72,7 +75,7 @@ defined based on
#. file size
-#. and/or path/pattern, and thus for example file extensions,
+#. and/or path/pattern, and thus, for example, file extensions,
or names, or file types (e.g., text files, as with the
``text2git`` configuration template).
@@ -84,43 +87,15 @@ creates a symlink with the original file name, pointing to the new location.
This process is often referred to as a file being *annexed*, and the object
tree is also known as the *annex* of a dataset.
-.. windows-wit:: What happens on Windows?
- :name: woa_objecttree
+.. index::
+ pair: elevated storage demand; in adjusted mode
+ pair: no symlinks; on Windows
+ pair: adjusted mode; on Windows
+.. windows-wit:: File content management on Windows (adjusted mode)
+ :name: ww-adjusted-mode
:float:
- Windows has insufficient support for :term:`symlink`\s and revoking write :term:`permissions` on files.
- Therefore, :term:`git-annex` classifies it as a :term:`crippled filesystem` and has to stray from its default behavior.
- While git-annex on Unix-based file operating systems stores data in the annex and creates a symlink in the data's original place, on Windows it moves data into the :term:`annex` and creates a *copy* of the data in its original place.
-
- **Why is that?**
- Data *needs* to be in the annex for version control and transport logistics -- the annex is able to store all previous versions of the data, and manage the transport to other storage locations if you want to publish your dataset.
- But as the :ref:`Findoutmore in this section ` will show, the :term:`annex` is a non-human readable tree structure, and data thus also needs to exist in its original location.
- Thus, it exists in both places: it has moved into the annex, and copied back into its original location.
- Once you edit an annexed file, the most recent version of the file is available in its original location, and past versions are stored and readily available in the annex.
- If you reset your dataset to a previous state (as is shown in the section :ref:`history`), the respective version of your data is taken from the annex and copied to replace the newer version, and vice versa.
-
- **But doesn't a copy mean data duplication?**
- Yes, absolutely!
- And that is a big downside to DataLad and :term:`git-annex` on Windows.
- If you have a dataset with annexed file contents (be that a dataset you created and populated yourself, or one that you cloned and got file contents with ``datalad get`` from), it will take up more space than on a Unix-based system.
- How much more?
- Every file that exists in your file hierarchy exists twice.
- A fresh dataset with one version of each file is thus twice as big as it would be on a Linux computer.
- Any past version of data does not exist in duplication.
-
- **Step-by-step demonstration**:
- Let's take a concrete example to explain the last point in more detail.
- How much space, do you think, is taken up in your dataset by the resized ``salt_logo_small.jpg`` image?
- As a reminder: It exists in two versions, a 400 by 400 pixel version (about 250Kb in size), and a 450 by 450 pixel version (about 310Kb in size).
- The 400 by 400 pixel version is the most recent one.
- The answer is: about 810Kb (~0.1Mb).
- The most recent 400x400px version exists twice (in the annex and as a copy), and the 450x450px copy exists once in the annex.
- If you would reset your dataset to the state when we created the 450x450px version, this file would instead exist twice.
-
- **Can I at least get unused or irrelevant data out of the dataset?**
- Yes, either with convenience commands (e.g., ``git annex unused`` followed by ``git annex dropunused``), or by explicitly using ``drop`` on files (or their past versions) that you don't want to keep anymore.
- Alternatively, you can transfer data you don't need but want to preserve to a different storage location.
- Later parts of the handbook will demonstrate each of these alternatives.
+ .. include:: topic/adjustedmode-nosymlinks.rst
For a demonstration that this file path is not complete gibberish,
take the target path of any of the book's symlinks and
@@ -135,7 +110,7 @@ open it, for example with ``evince ``, or any other PDF reader in exchange
Even though the path looks cryptic, it works and opens the file. Whenever you
-use a command like ``evince TLCL.pdf``, internally, your shell will follow
+use a command like ``evince TLCL.pdf``, internally, programs will follow
the same cryptic symlink as the one you have just opened.
But *why* does this symlink-ing happen? Up until now, it still seems like a very
@@ -168,15 +143,15 @@ This comes with two very important advantages:
One, should you have copies of the
same data in different places of your dataset, the symlinks of these files
-point to the same place (in order to understand why this is the case, you
-will need to read the hidden section at the end of the page).
+point to the same place -- in order to understand why this is the case, you
+will need to read the :find-out-more:`about the object tree <fom-objecttree>`.
Therefore, any amount of copies of a piece of data
is only one single piece of data in your object tree. This, depending on
how much identical file content lies in different parts of your dataset,
can save you much disk space and time.
The second advantage is less intuitive but clear for users familiar with Git.
-Small symlinks can be written very very fast when switching :term:`branch`\es, as opposed to copying and deleting huge data files.
+Compared to copying and deleting huge data files, small symlinks can be written very very fast, for example, when switching dataset versions or :term:`branch`\es.
.. gitusernote:: Speedy branch switches
@@ -197,34 +172,36 @@ work with the paths in the object tree than you or any other human are.
Lastly, understanding that annexed files in your dataset are symlinked
will be helpful to understand how common file system operations such as
moving, renaming, or copying content translate to dataset modifications
-in certain situations. Later in this book we will have a section on how
-to manage the file system in a DataLad dataset (:ref:`filesystem`).
+in certain situations. Later in this book, the section :ref:`file system <filesystem>`
+will take a closer look at that.
-
-.. find-out-more:: more about paths, checksums, object trees, and data integrity
+.. _objecttree:
+.. index::
+ pair: key; git-annex concept
+.. find-out-more:: Data integrity and annex keys
:name: fom-objecttree
So how do these cryptic paths and names in the object tree come into existence?
It's not malicious intent that leads to these paths and file names -- it's checksums.
- When a file is annexed, git-annex generates a *key* (or :term:`checksum`) from the **file content**.
+ When a file is annexed, git-annex typically generates a *key* (or :term:`annex key`) from the **file content**.
It uses this key (in part) as a name for the file and as the path
in the object tree.
Thus, the key is associated with the content of the file (the *value*),
- and therefore, using this key, file content can be identified --
- or rather: Based on the keys, it can be identified whether file content changed,
- and whether two files have identical contents.
+ and therefore, using this key, file content can be identified.
- The key is generated using *hashes*. A hash is a function that turns an
- input (e.g., a PDF file) into a string of characters with a fixed length based on its contents.
+ Most key types contain a :term:`checksum`. This is a string of a fixed number of characters
+ computed from some input, for example the content of a PDF file,
+ by a *hash* function.
- Importantly, a hash function will generate the same character sequence for the same file content, and once file content changes, the generated hash changes, too.
+ This checksum *uniquely* identifies a file's content.
+ A hash function will generate the same character sequence for the same file content, and once file content changes, the generated checksum changes, too.
Basing the file name on its contents thus becomes a way of ensuring data integrity:
- File content can not be changed without git-annex noticing, because file's hash, and thus its key in its symlink, will change.
- Furthermore, if two files have identical hashes, the content in these files is identical.
+ File content cannot be changed without git-annex noticing, because the file's checksum, and thus its key in its symlink, will change.
+ Furthermore, if two files have identical checksums, the content in these files is identical.
Consequently, if two files have the same symlink, and thus link the same file in the object-tree, they are identical in content.
This can save disk space if a dataset contains many identical files: Copies of the same data only need one instance of that content in the object tree, and all copies will symlink to it.
- If you want to read more about the computer science basics about hashes check out the `Wikipedia page `_.
+   If you want to read more about the computer science basics of hash functions, check out the `Wikipedia page <https://en.wikipedia.org/wiki/Hash_function>`_.
.. runrecord:: _examples/DL-101-115-104
:language: console
@@ -233,52 +210,56 @@ to manage the file system in a DataLad dataset (:ref:`filesystem`).
:notes: how does the symlink relate to the shasum of the file?
:cast: 03_git_annex_basics
- # take a look at the last part of the target path:
+ $ # take a look at the last part of the target path:
$ ls -lh TLCL.pdf
Let's take a closer look at the structure of the symlink.
The key from the hash function is the last part of the name of the file the symlink links to (in which the actual data content is stored).
+ .. index::
+ pair: compute checksum; in a terminal
.. runrecord:: _examples/DL-101-115-105
:language: console
:workdir: dl-101/DataLad-101/books
:notes: let's look at how the shasum would look like
:cast: 03_git_annex_basics
- # compare it to the checksum (here of type md5sum) of the PDF file and the subdirectory name
+ $ # compare it to the checksum (here of type md5sum) of the PDF file and the subdirectory name
$ md5sum TLCL.pdf
- The extension (e.g., ``.pdf``) is appended because some operating systems (*ehem*, Windows) need this information in order to select the right software to open a file.
+   The extension (e.g., ``.pdf``) is appended because some programs require it, and would fail when working not with the symlink directly, but with the file it points to.
Right at the beginning, the symlink starts with two directories just after ``.git/annex/objects/``,
consisting of two letters each.
These two letters are derived from the md5sum of the key, and their sole purpose is to avoid issues with too many files in one directory (which is a situation that certain file systems have problems with).
The next subdirectory in the symlink helps to prevent accidental deletions and changes, as it does not have write :term:`permissions`, so that users cannot modify any of its underlying contents.
- This is the reason that annexed files need to be unlocked prior to modifications, and this information will be helpful to understand some file system management operations such as removing files or datasets (see section :ref:`filesystem`).
+   This is the reason that annexed files need to be unlocked prior to modifications, and this information will be helpful to understand some file system management operations such as removing files or datasets. Section :ref:`file system <filesystem>` takes a look at that.
- The next part of the symlink contains the actual hash.
- There are different hash functions available.
+ The next part of the symlink contains the actual checksum.
+ There are different :term:`annex key` backends that use different checksums.
Depending on which is used, the resulting :term:`checksum` has a certain length and structure, and the first part of the symlink actually states which hash function is used.
- By default, DataLad uses ``MD5E`` checksums (relatively short and with a file extension), but should you want to, you can change this default to `one of many other types `_.
- The reason why MD5E is used is because of its short length -- thus it is possible to ensure cross-platform compatibility and share datasets also with users on operating systems that have restrictions on total path lengths, such as Windows.
+   By default, DataLad uses the ``MD5E`` git-annex backend (the ``E`` adds file extensions to annex keys), but should you want to, you can change this default to `one of many other types <https://git-annex.branchable.com/backends/>`_.
+   The reason why MD5E is used is the relatively short length of the underlying MD5 checksums -- this facilitates cross-platform compatibility, and makes it possible to share datasets also with users on operating systems that have restrictions on total path length, such as Windows.
The one remaining unidentified bit in the file name is the one after the checksum identifier.
This part is the size of the content in bytes.
- An annexed file in the object tree thus has a file name following this structure:
+ An annexed file in the object tree thus has a file name following this structure
+   (but see `the git-annex documentation on keys <https://git-annex.branchable.com/internals/key_format/>`_ for the complete details):
- ``checksum-identifier - size -- checksum . extension``
+   ``<checksum-identifier>-s<size>--<checksum>.<extension>``
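+
+   A purely hypothetical key of the default ``MD5E`` backend could thus look
+   like ``MD5E-s248087--d41d8cd98f00b204e9800998ecf8427e.pdf``.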
You now know a great deal more about git-annex and the object tree.
Maybe you are as amazed as we are about some of the ingenuity used behind the scenes.
Even more mesmerizing things about git-annex can be found in its `documentation <https://git-annex.branchable.com/>`_.
+.. index:: ! broken symlink, ! symlink; broken
+.. _wslfiles:
+
Broken symlinks
^^^^^^^^^^^^^^^
-.. index:: ! broken symlink, ! symlink (broken)
-
Whenever a symlink points to a non-existent target, this symlink is called
*broken*, and opening the symlink would not work as it does not resolve. The
-section :ref:`filesystem` will give a thorough demonstration of how symlinks can
+section :ref:`file system <filesystem>` will give a thorough demonstration of how symlinks can
break, and how one can fix them again. Even though *broken* sounds
troublesome, most types of broken symlinks you will encounter can be fixed,
or are not problematic. At this point, you actually have already seen broken
@@ -300,6 +281,17 @@ Alternatively, use the :shcmd:`ls` command in a terminal instead of a file manag
Other tools may be more specialized, smaller, or domain-specific, and may fail to correctly work with broken symlinks, or display unhelpful error messages when handling them, or require additional flags to modify their behavior.
When encountering unexpected behavior or failures, try to keep in mind that a dataset without retrieved content appears to be a pile of broken symlinks to a range of tools, consult a tool's documentation with regard to symlinks, and check whether data retrieval fixes persisting problems.
+One last special case around symlinks exists if you are using DataLad on the Windows Subsystem for Linux.
+If so, please take a look into the Windows Wit below.
+
+.. index::
+ pair: access WSL2 symlinked files; on Windows
+ single: WSL2; symlink access
+ pair: log; Git command
+.. windows-wit:: Accessing symlinked files from your Windows system
+
+ .. include:: topic/wsl2-symlinkaccess.rst
+
Finally, if you are still in the ``books/`` directory, go back into the root of
the superdataset.
@@ -311,30 +303,3 @@ the superdataset.
:cast: 03_git_annex_basics
$ cd ../
-
-
-.. _wslfiles:
-
-Cross-OS filesharing with symlinks (WSL2 only)
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Are you using DataLad on the Windows Subsystem for Linux?
-If so, please take a look into the Windows Wit below.
-
-.. windows-wit:: Accessing symlinked files from your Windows system
-
- If you are using WSL2 you have access to a Linux kernel and POSIX filesystem, including symlink support.
- Your DataLad experience has therefore been exactly as it has been for macOS or Linux users.
- But one thing that bears the need for additional information is sharing files in dataset between your Linux and Windows system.
-
- It's fantastic that files created under Linux can be shared to Windows and used by Windows tools.
- Usually, you should be able to open an explorer and type ``\\wsl$\\`` in the address bar to navigate to files under Linux, or type ``explorer.exe`` into the WSL2 terminal.
- Some core limitations of Windows can't be overcome, though: Windows usually isn't capable of handling symlinks.
- So while WSL2 can expose your dataset filled with symlinked files to Windows, your Windows tools can fail to open them.
- How can this be fixed?
-
- Whenever you need to work with files from your datasets under Windows, you should *unlock* with ``datalad unlock``.
- This operation copies the file from the annex back to its original location, and thus removes the symlink (and also returns write :term:`permissions` to the file).
- Alternatively, use `git-annex adjust --unlock `_ to switch to a new dataset :term:`branch` in which all files are unlocked.
- The branch is called ``adjusted/(unlocked)`` (e.g., if the original branch name was ``main``, the new, adjusted branch will be called ``adjusted/main(unlocked)``).
- You can switch back to your original branch using ``git checkout ``.
diff --git a/docs/basics/101-116-sharelocal.rst b/docs/basics/101-116-sharelocal.rst
index 6e1a3e915..acf51fc4f 100644
--- a/docs/basics/101-116-sharelocal.rst
+++ b/docs/basics/101-116-sharelocal.rst
@@ -33,33 +33,34 @@ DataLad for, if everyone can already access everything?" However,
universal, unrestricted access can easily lead to chaos. DataLad can
help facilitate collaboration without requiring ultimate trust and
reliability of all participants. Essentially, with a shared dataset,
-collaborators can look and use your dataset without ever touching it.
+collaborators can see and use your dataset without any danger
+of undesired or uncontrolled modification.
To demonstrate how to share a DataLad dataset on a common file system,
we will pretend that your personal computer
can be accessed by other users. Let's say that
-your room mate has access, and you're making sure that there is
+your room mate has access, and you are making sure that there is
a ``DataLad-101`` dataset in a different place on the file system
for him to access and work with.
This is indeed a common real-world use case: Two users on a shared
file system sharing a dataset with each other.
-But as we can not easily simulate a second user in this handbook,
+But as we cannot easily simulate a second user in this handbook,
for now, you will have to share your dataset with yourself.
This endeavor serves several purposes: For one, you will experience a very easy
way of sharing a dataset. Secondly, it will show you
-how a dataset can be obtained from a path (instead of a URL as shown in the section
-:ref:`installds`). Thirdly, ``DataLad-101`` is a dataset that can
+how a dataset can be obtained from a path, instead of a URL as shown in section
+:ref:`installds`. Thirdly, ``DataLad-101`` is a dataset that can
showcase many different properties of a dataset already, but it will
be an additional learning experience to see how the different parts
-of the dataset -- text files, larger files, datalad subdataset,
-:dlcmd:`run` commands -- will appear upon installation when shared.
+of the dataset -- text files, larger files, subdatasets,
+:term:`run record`\s -- will appear upon installation when shared.
And lastly, you will likely "share a dataset with yourself" whenever you
will be using a particular dataset of your own creation as input for
one or more projects.
-"Awesome!" exclaims your room mate as you take out your Laptop to
-share the dataset. "You're really saving my ass
+"Awesome!" exclaims your room mate as you take out your laptop to
+share the dataset. "You are really saving my ass
here. I'll make up for it when we prepare for the final", he promises.
To install ``DataLad-101`` into a different part
@@ -81,7 +82,9 @@ home directory. Furthermore, let's for now disregard anything about
:term:`permissions`. In a real-world example you likely would not be able to read and write
to a different user's directories, but we will talk about permissions later.
-.. index:: ! datalad command; clone
+.. index::
+ pair: clone; DataLad command
+ pair: clone dataset (set location description); with DataLad
After creation, navigate into ``mock_user`` and install the dataset ``DataLad-101``.
To do this, use :dlcmd:`clone`, and provide a path to your original
@@ -134,7 +137,8 @@ object tree. To reassure your room mate that everything is fine you
quickly explain to him the concept of a symlink and the :term:`object-tree`
of :term:`git-annex`.
-.. index:: ! datalad command; clone
+.. index::
+ pair: clone; DataLad command
"But why does the PDF not open when I try to open it?" he repeats.
True, these files cannot be opened. This mimics our experience when
@@ -163,113 +167,12 @@ To demonstrate this, you decide to examine the PDFs further.
"Opening this file will work, because the content was retrieved from
the original dataset.", you explain, proud that this worked just as you
-thought it would. Your room mate is excited by this magical
-command. You however begin to wonder: how does DataLad know where to look for
-that original content?
-
-This information comes from git-annex. Before getting the next PDF,
-let's query git-annex where its content is stored:
-
-.. runrecord:: _examples/DL-101-116-105
- :language: console
- :workdir: dl-101/mock_user/DataLad-101
- :notes: git-annex whereis to find out where content is stored
- :cast: 04_collaboration
-
- $ git annex whereis books/TLCL.pdf
-
-Oh, another :term:`shasum`! This time however not in a symlink...
-"That's hard to read -- what is it?" your room mate asks. You can
-recognize a path to the dataset on your computer, prefixed with the user
-and hostname of your computer. "This", you exclaim, excited about your own realization,
-"is my dataset's location I'm sharing it from!"
-
-.. find-out-more:: What is this location, and what if I provided a description?
-
- Back in the very first section of the Basics, :ref:`createDS`, a hidden
- section mentioned the ``--description`` option of :dlcmd:`create`.
- With this option, you can provide a description about the dataset *location*.
-
- The :gitannexcmd:`whereis` command, finally, is where such a description
- can become handy: If you had created the dataset with
-
- .. code-block:: bash
-
- $ datalad create --description "course on DataLad-101 on my private Laptop" -c text2git DataLad-101
-
- the command would show ``course on DataLad-101 on my private Laptop`` after
- the :term:`shasum` -- and thus a more human-readable description of *where*
- file content is stored.
- This becomes especially useful when the number of repository copies
- increases. If you have only one other dataset it may be easy to
- remember what and where it is. But once you have one back-up
- of your dataset on a USB-Stick, one dataset shared with
- Dropbox, and a third one on your institutions
- :term:`GitLab` instance you will be grateful for the descriptions
- you provided these locations with.
-
- The current report of the location of the dataset is in the format
- ``user@host:path``.
-
- If the physical location of a dataset is not relevant, ambiguous, or volatile,
- or if it has an :term:`annex` that could move within the foreseeable lifetime of a
- dataset, a custom description with the relevant information on the dataset is
- superior. If this is not the case, decide for yourself whether you want to use
- the ``--description`` option for future datasets or not depending on what you
- find more readable -- a self-made location description, or an automatic
- ``user@host:path`` information.
-
-
-The message further informs you that there is only "``(1 copy)``"
-of this file content. This makes sense: There
-is only your own, original ``DataLad-101`` dataset in which
-this book is saved.
-
-To retrieve file content of an annexed file such as one of
-these PDFs, git-annex will try
-to obtain it from the locations it knows to contain this content.
-It uses the checksums to identify these locations. Every copy
-of a dataset will get a unique ID with such a checksum.
-Note however that just because git-annex knows a certain location
-where content was once it does not guarantee that retrieval will
-work. If one location is a USB-Stick that is in your bag pack instead
-of your USB port,
-a second location is a hard drive that you deleted all of its
-previous contents (including dataset content) from,
-and another location is a web server, but you are not connected
-to the internet, git-annex will not succeed in retrieving
-contents from these locations.
-As long as there is at least one location that contains
-the file and is accessible, though, git-annex will get the content.
-Therefore, for the books in your dataset, retrieving contents works because you
-and your room mate share the same file system. If you'd share the dataset
-with anyone without access to your file system, ``datalad get`` would not
-work, because it can not access your files.
-
-But there is one book that does not suffer from this restriction:
-The ``bash_guide.pdf``.
-This book was not manually downloaded and saved to the dataset with ``wget``
-(thus keeping DataLad in the dark about where it came from), but it was
-obtained with the :dlcmd:`download-url` command. This registered
-the books original source in the dataset, and here is why that is useful:
-
-.. runrecord:: _examples/DL-101-116-106
- :language: console
- :workdir: dl-101/mock_user/DataLad-101
-
- $ git annex whereis books/bash_guide.pdf
-
-Unlike the ``TLCL.pdf`` book, this book has two sources, and one of them is
-``web``. The second to last line specifies the precise URL you downloaded the
-file from. Thus, for this book, your room mate is always able to obtain it
-(as long as the URL remains valid), even if you would delete your ``DataLad-101``
-dataset. Quite useful, this provenance, right?
+thought it would.
Let's now turn to the fact that the subdataset ``longnow`` contains neither
file content nor file metadata information to explore the contents of the
dataset: there are no subdirectories or any files under ``recordings/longnow/``.
This is behavior that you have not observed until now.
-
To fix this and obtain file availability metadata,
you have to run a somewhat unexpected command:
@@ -281,8 +184,8 @@ you have to run a somewhat unexpected command:
$ datalad get -n recordings/longnow
-The section below will elaborate on :dlcmd:`get` and the
-``-n/--no-data`` option, but for now, let's first see what has changed after
+Before we look further into :dlcmd:`get` and the
+``-n/--no-data`` option, let's first see what has changed after
running the above command (excerpt):
.. runrecord:: _examples/DL-101-116-108
@@ -308,7 +211,7 @@ subdataset again, just as we did in the example above.
But what about the ``-n`` option for :dlcmd:`get`?
Previously, we used :dlcmd:`get` to get file content. However,
-:dlcmd:`get` operate on more than just the level of *files* or *directories*.
+:dlcmd:`get` operates on more than just the level of *files* or *directories*.
Instead, it can also operate on the level of *datasets*. Regardless of whether
it is a single file (such as ``books/TLCL.pdf``) or a registered subdataset
(such as ``recordings/longnow``), :dlcmd:`get` will operate on it to 1) install
@@ -324,13 +227,16 @@ directories, thus limiting its scope to the level of datasets as only a
have retrieved all of the subdataset's contents right away. But with ``-n/--no-data``,
it only installed the subdataset to retrieve the metadata about file availability.
+.. index::
+ pair: get all dataset content; with DataLad
+
To explicitly install all potential subdatasets *recursively*, that is,
all of the subdatasets inside it as well, one can give the
``-r``/``--recursive`` option to :dlcmd:`get`:
-.. code-block:: bash
+.. code-block:: console
- datalad get -n -r
+ $ datalad get -n -r
This would install the ``subds`` subdataset and all potential further
subdatasets inside of it, and the metadata about file hierarchies would
@@ -348,9 +254,9 @@ However, there is a middle way [#f1]_: The ``--recursion-limit`` option let's
you specify how many levels of subdatasets should be installed together
with the first subdataset:
-.. code-block:: bash
+.. code-block:: console
- datalad get -n -r --recursion-limit 1
+ $ datalad get -n -r --recursion-limit 1
To summarize what you learned in this section, write a note on how to
@@ -364,9 +270,9 @@ Write this note in "your own" (the original) ``DataLad-101`` dataset, though!
:notes: note in original DataLad-101 dataset
:cast: 04_collaboration
- # navigate back into the original dataset
+ $ # navigate back into the original dataset
$ cd ../../DataLad-101
- # write the note
+ $ # write the note
$ cat << EOT >> notes.txt
A source to install a dataset from can also be a path, for example as
in "datalad clone ../DataLad-101".
@@ -396,6 +302,8 @@ Save this note.
$ datalad save -m "add note about cloning from paths and recursive datalad get"
+.. index::
+ pair: clone; DataLad concept
.. gitusernote:: Get a clone
A dataset that is installed from an existing source, e.g., a path or URL,
diff --git a/docs/basics/101-117-sharelocal2.rst b/docs/basics/101-117-sharelocal2.rst
index da33e487c..2947edf2b 100644
--- a/docs/basics/101-117-sharelocal2.rst
+++ b/docs/basics/101-117-sharelocal2.rst
@@ -17,88 +17,127 @@ exactly the specified registered subdataset.
And you have mesmerized your room mate by showing him how :term:`git-annex`
retrieved large file contents from the original dataset.
+Your room mate is excited by this magical command.
+You, however, begin to wonder: how does DataLad know where to look for that original content?
-Let's now see the :gitannexcmd:`whereis` command in more detail,
-and find out how git-annex knows *where* file content can be obtained from.
-Within the original ``DataLad-101`` dataset, you retrieved some of the ``.mp3``
-files via :dlcmd:`get`, but not others. How will this influence the
-output of :gitannexcmd:`whereis`, you wonder?
-
-Together with your room mate, you decide to find out. You navigate
-back into the installed dataset, and run :gitannexcmd:`whereis` on a
-file that you once retrieved file content for, and on a file
-that you did not yet retrieve file content for.
-Here is the output for the retrieved file:
+This information comes from git-annex.
+Before getting another PDF, let's query git-annex where its content is stored:
+.. index::
+ pair: whereis; git-annex command
+ pair: show file content availability; with git-annex
.. runrecord:: _examples/DL-101-117-101
:language: console
:workdir: dl-101/DataLad-101
- :notes: More on how git-annex whereis behaves
+ :notes: git-annex whereis to find out where content is stored
:cast: 04_collaboration
- # navigate back into the clone of DataLad-101
+ $ # navigate back into the clone of DataLad-101
$ cd ../mock_user/DataLad-101
- # navigate into the subdirectory
- $ cd recordings/longnow
- # file content exists in original DataLad-101 for this file
- $ git annex whereis Long_Now__Seminars_About_Long_term_Thinking/2003_11_15__Brian_Eno__The_Long_Now.mp3
-
-And here is the output for a file that you did not yet retrieve
-content for in your original ``DataLad-101`` dataset.
+ $ git annex whereis books/TLCL.pdf
+
+Oh, another cryptic character sequence -- this time, however, not a symlink, but an :term:`annex UUID`.
+"That's hard to read -- what is it?" your room mate asks.
+You can recognize a path to the dataset on your computer, prefixed with the user and hostname of your computer.
+"This", you exclaim, excited about your own realization, "is my dataset's location I'm sharing it from!"
+
+.. index::
+ pair: set description for dataset location; with DataLad
+.. find-out-more:: What is this location, and what if I provided a description?
+
+ Back in the very first section of the Basics, :ref:`createDS`, a :ref:`Find-out-more mentioned the '--description' option ` of :dlcmd:`create`.
+ With this option, you can provide a description about the dataset *location*.
+
+ The :gitannexcmd:`whereis` command, finally, is where such a description
+ can become handy: If you had created the dataset with
+
+ .. code-block:: console
+
+ $ datalad create --description "course on DataLad-101 on my private laptop" -c text2git DataLad-101
+
+ the command would show ``course on DataLad-101 on my private laptop`` after
+ the :term:`UUID` -- and thus a more human-readable description of *where*
+ file content is stored.
+ This becomes especially useful when the number of repository copies
+ increases. If you have only one other dataset it may be easy to
+ remember what and where it is. But once you have one back-up
+ of your dataset on a USB stick, one dataset shared with
+ Dropbox, and a third one on your institution's
+ :term:`GitLab` instance, you will be grateful for the descriptions
+ you provided these locations with.
+
+ The current report of the location of the dataset is in the format
+ ``user@host:path``.
+
+ If the physical location of a dataset is not relevant, ambiguous, or volatile,
+ or if it has an :term:`annex` that could move within the foreseeable lifetime of a
+ dataset, a custom description with the relevant information on the dataset is
+ superior. If this is not the case, decide for yourself whether you want to use
+ the ``--description`` option for future datasets or not depending on what you
+ find more readable -- a self-made location description, or an automatic
+ ``user@host:path`` information.
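+
+ By the way, if you did not provide a description when creating the dataset, you can still set or change one later via git-annex (a sketch; ``here`` refers to the repository you run the command in):
+
+ .. code-block:: console
+
+    $ git annex describe here "course on DataLad-101 on my private laptop"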
+
+
+The message further informs you that there is only "``(1 copy)``" of this file content.
+This makes sense: There is only your own, original ``DataLad-101`` dataset in which this book is saved.
+
+To retrieve file content of an annexed file such as one of these PDFs, git-annex will try to obtain it from the locations it knows to contain this content.
+It uses the UUID to identify these locations.
+Every copy of a dataset will get a UUID as a unique identifier.
+Note, however, that just because git-annex knows a certain location where content once was, it does not guarantee that retrieval will work.
+If one location is a USB stick that is in your backpack instead of your USB port, a second location is a hard drive from which you deleted all previous contents (including dataset content),
+and another location is a web server, but you are not connected to the internet, git-annex will not succeed in retrieving contents from these locations.
+As long as there is at least one location that contains the file and is accessible, though, git-annex will get the content.
+Therefore, for the books in your dataset, retrieving contents works because you and your room mate share the same file system.
+If you shared the dataset with anyone without access to your file system, ``datalad get`` would not work, because it cannot access your files.
+
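+If you are curious which repository copies (and which UUIDs or descriptions) a dataset knows about, you can also ask git-annex directly -- a quick aside; the output naturally differs between datasets:
+
+.. code-block:: console
+
+   $ # list all known repositories without computing annex sizes
+   $ git annex info --fast
+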
+But there is one book that does not suffer from this restriction:
+The ``bash_guide.pdf``.
+This book was not manually downloaded and saved to the dataset with ``wget`` (thus keeping DataLad in the dark about where it came from), but it was obtained with the :dlcmd:`download-url` command.
+This registered the book's original source in the dataset, and here is why that is useful:
.. runrecord:: _examples/DL-101-117-102
:language: console
- :workdir: dl-101/mock_user/DataLad-101/recordings/longnow
- :cast: 04_collaboration
+ :workdir: dl-101/mock_user/DataLad-101
- # but not for this:
- $ git annex whereis Long_Now__Seminars_About_Long_term_Thinking/2005_01_15__James_Carse__Religious_War_In_Light_of_the_Infinite_Game.mp3
+ $ git annex whereis books/bash_guide.pdf
-As you can see, the file content previously downloaded with a
-:dlcmd:`get` has a third source, your original dataset on your computer.
-The file we did not yet retrieve in the original dataset
-only has only two sources.
+Unlike the ``TLCL.pdf`` book, this book has two sources, and one of them is ``web``.
+The second to last line specifies the precise URL you downloaded the file from.
+Thus, for this book, your room mate is always able to obtain it (as long as the URL remains valid), even if you deleted your ``DataLad-101`` dataset.
-Let's see how this affects a :dlcmd:`get`:
+We can also see which source git-annex retrieves the content from if we look at the very end of the ``get`` summary.
.. runrecord:: _examples/DL-101-117-103
:language: console
- :workdir: dl-101/mock_user/DataLad-101/recordings/longnow
- :notes: Get a file that is present in original and one that is not
- :cast: 04_collaboration
+ :workdir: dl-101/mock_user/DataLad-101
- # get the first file
- $ datalad get Long_Now__Seminars_About_Long_term_Thinking/2003_11_15__Brian_Eno__The_Long_Now.mp3
+ $ datalad get books/TLCL.pdf
+ $ datalad get books/bash_guide.pdf
+Both of these files were retrieved "``from origin...``".
+``Origin`` is Git terminology for "where the dataset was copied from" -- ``origin`` therefore is the original ``DataLad-101`` dataset, from which file content can be retrieved very fast.
+
+If your room mate did not have access to the same file system, or if you deleted your ``DataLad-101`` dataset, this output would look different.
+The ``datalad get`` command would fail on the ``TLCL.pdf`` book without a known second source, and ``bash_guide.pdf`` would be retrieved "``from web...``" -- the registered second source, its original download URL.
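+
+If you ever wonder which location ``origin`` actually points to in your clone, Git can tell you (a quick check that works in any clone):
+
+.. code-block:: console
+
+   $ git remote -v
+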
+Let's see a retrieval from ``web`` in action for another file.
+The ``.mp3`` files in the ``longnow`` seminar series have registered web URLs [#f1]_.
.. runrecord:: _examples/DL-101-117-104
:language: console
- :workdir: dl-101/mock_user/DataLad-101/recordings/longnow
+ :workdir: dl-101/mock_user/DataLad-101
+ :notes: More on how git-annex whereis behaves
:cast: 04_collaboration
- # get the second file
- $ datalad get Long_Now__Seminars_About_Long_term_Thinking/2005_01_15__James_Carse__Religious_War_In_Light_of_the_Infinite_Game.mp3
-
-
-The most important thing to note is: It worked in both cases, regardless of whether the original
-``DataLad-101`` dataset contained the file content or not.
-
-We can see that git-annex used two different sources to retrieve the content from,
-though, if we look at the very end of the ``get`` summary.
-The first file was retrieved "``from origin...``". ``Origin`` is Git terminology
-for "from where the dataset was copied from" -- ``origin`` therefore is the
-original ``DataLad-101`` dataset.
-
-The second file was retrieved "``from web...``", and thus from a different source.
-This source is called ``web`` because it actually is a URL through which this particular
-podcast-episode is made available in the first place. You might also have noticed that the
-download from web took longer than the retrieval from the directory on the same
-file system. But we will get into the details
-of this type of content source
-once we cover the ``importfeed`` and ``add-url`` functions [#f1]_.
+ $ # navigate into the subdirectory
+ $ cd recordings/longnow
+ $ git annex whereis Long_Now__Seminars_About_Long_term_Thinking/2003_11_15__Brian_Eno__The_Long_Now.mp3
+ $ datalad get Long_Now__Seminars_About_Long_term_Thinking/2003_11_15__Brian_Eno__The_Long_Now.mp3
-Let's for now add a note on the :gitannexcmd:`whereis` command. Again, do
-this in the original ``DataLad-101`` directory, and do not forget to save it.
+As you can see at the end of the ``get`` result, the file has been retrieved "``from web...``".
+Quite useful, this provenance, right?
+Let's add a note on the :gitannexcmd:`whereis` command.
+Again, do this in the original ``DataLad-101`` directory, and do not forget to save it.
.. runrecord:: _examples/DL-101-117-105
:language: console
@@ -106,10 +145,10 @@ this in the original ``DataLad-101`` directory, and do not forget to save it.
:notes: a note in original dataset
:cast: 04_collaboration
- # navigate back:
+ $ # navigate back:
$ cd ../../../../DataLad-101
- # write the note
+ $ # write the note
$ cat << EOT >> notes.txt
The command "git annex whereis PATH" lists the repositories that have
the file content of an annexed file. When using "datalad get" to
@@ -148,5 +187,5 @@ this in the original ``DataLad-101`` directory, and do not forget to save it.
.. [#f1] Maybe you wonder what the location ``mih@medusa`` is. It is a copy of the
data on an account belonging to user ``mih`` on the host name ``medusa``.
Because we do not have the host name's address, nor log-in credentials for
- this user, we can not retrieve content from this location. However, somebody
- else (for example the user ``mih``) could.
+ this user, we cannot retrieve content from this location. However, somebody
+ else (for example, the user ``mih``) could.
diff --git a/docs/basics/101-118-sharelocal3.rst b/docs/basics/101-118-sharelocal3.rst
index e3da1bb14..67c9cd5f4 100644
--- a/docs/basics/101-118-sharelocal3.rst
+++ b/docs/basics/101-118-sharelocal3.rst
@@ -45,7 +45,7 @@ want to run by taking a look into the history of the dataset
:notes: More cool things on shared datasets: rerunning run commands
:cast: 04_collaboration
- # navigate into the shared copy
+ $ # navigate into the shared copy
$ cd ../mock_user/DataLad-101
.. runrecord:: _examples/DL-101-118-102
@@ -55,7 +55,7 @@ want to run by taking a look into the history of the dataset
:notes: find the shasum
:cast: 04_collaboration
- # lets view the history
+ $ # lets view the history
$ git log --oneline -n 10
Ah, there it is, the second most recent commit.
@@ -74,8 +74,7 @@ command:
file content from the subdataset and it tried to unlock the output
prior to the command execution. Note that because you did not retrieve
the output, ``recordings/salt_logo_small.jpg``, yet, the missing content
-could not be unlocked. DataLad warns you about this, but proceeds
-successfully.
+could not be "unlocked", but is reportedly "removed" prior to the successful rerun.
Your room mate now not only knows how exactly the resized file
came into existence, but he can also reproduce your exact steps to
diff --git a/docs/basics/101-119-sharelocal4.rst b/docs/basics/101-119-sharelocal4.rst
index a04ae4153..42980b557 100644
--- a/docs/basics/101-119-sharelocal4.rst
+++ b/docs/basics/101-119-sharelocal4.rst
@@ -9,13 +9,13 @@ All of what you have seen about sharing dataset was really
cool, and for the most part also surprisingly intuitive.
:dlcmd:`run` commands or file retrieval worked exactly as
you imagined it to work, and you begin to think that slowly but
-steadily you're getting a feel about how DataLad really works.
+steadily you are getting a feel about how DataLad really works.
But to be honest, so far, sharing the dataset with DataLad was
also remarkably unexciting given that you already knew most of
the dataset magic that your room mate currently is still
mesmerized about.
-To be honest, you're not yet certain whether
+To be honest, you are not yet certain whether
sharing data with DataLad really improves your life up
until this point. After all, you could have just copied
your directory into your ``mock_user`` directory and
@@ -36,7 +36,7 @@ installation in ``../mock_user/DataLad-101``:
:notes: On updating dataset. How do we get the updated notes from the original dataset?
:cast: 04_collaboration
- # Inside the installed copy, view the last 15 lines of notes.txt
+ $ # Inside the installed copy, view the last 15 lines of notes.txt
$ tail notes.txt
But the original intention of sharing the dataset with
@@ -50,7 +50,9 @@ it can query the original dataset whether any changes
happened since the last time it checked, and if so, retrieve and
integrate them.
-.. index:: ! datalad command; update
+.. index::
+ pair: update; DataLad command
+ pair: update dataset with remote change; with DataLad
This is done with the :dlcmd:`update --how merge`
command.
@@ -80,7 +82,7 @@ the previously missing changes are now present:
:notes: let's check whether the updates are there
:cast: 04_collaboration
- # view the last 15 lines of notes.txt
+ $ # view the last 15 lines of notes.txt
$ tail notes.txt
Wohoo, the contents are here!
@@ -99,10 +101,10 @@ dataset to your own ``DataLad-101`` dataset:
:notes: note in original ds
:cast: 04_collaboration
- # navigate back:
+ $ # navigate back:
$ cd ../../DataLad-101
- # write the note
+ $ # write the note
$ cat << EOT >> notes.txt
To update a shared dataset, run the command "datalad update --how merge".
This command will query its origin for changes, and integrate the
@@ -116,17 +118,17 @@ dataset to your own ``DataLad-101`` dataset:
:notes:
:cast: 04_collaboration
- # save the changes
-
+ $ # save the changes
$ datalad save -m "add note about datalad update"
-PS: You might wonder whether there is also a sole
-:dlcmd:`update` command. Yes, there is -- if you are
-a Git-user and know about branches and merging you can read the
-``Note for Git-users`` below. However, a thorough explanation
+PS: You might wonder what a plain :dlcmd:`update` command with no options does.
+If you are a Git-user and know about branches and merging, you can read the
+``Note for Git-users``. However, a thorough explanation
and demonstration will be in the next section.
+.. index::
+ pair: update; DataLad concept
.. gitusernote:: Update internals
:dlcmd:`update` is the DataLad equivalent of a :gitcmd:`fetch`,
diff --git a/docs/basics/101-120-summary.rst b/docs/basics/101-120-summary.rst
index a9db99f88..1543f1101 100644
--- a/docs/basics/101-120-summary.rst
+++ b/docs/basics/101-120-summary.rst
@@ -10,8 +10,11 @@ sharing a dataset with a simple example.
* To obtain a dataset, one can also use :dlcmd:`clone` with a path.
Potential subdatasets will not be installed right away. As they are registered in
- the superdataset, you can do :dlcmd:`get -n/--no-data`,
- or specify the ``-r``/``--recursive`` (``datalad get -n -r ``)
+ the superdataset, you can
+
+ - do ``datalad get -n/--no-data``,
+ - or specify the ``-r``/``--recursive`` option: ``datalad get -n -r <path>``
+
with a decent ``-R/--recursion-limit`` choice to install them afterwards.
* The configuration of the original dataset determines which types
@@ -29,11 +32,10 @@ sharing a dataset with a simple example.
file content sources.
* :gitannexcmd:`whereis PATH` will list all locations known to contain file
- content for a particular file. This location is where :term:`git-annex`
- will attempt to retrieve file content from, and it is described with the
- ``--description`` provided during a :dlcmd:`create`. It is a very
+ content for a particular file. It is a very
helpful command to find out where file content resides, and how many
- locations with copies exist.
+ locations with copies exist. :term:`git-annex` will try to retrieve file contents from those locations. If you want, you can describe locations with the
+ ``--description`` option of :dlcmd:`create`.
* A shared copy of a dataset includes the datasets history. If well made,
:dlcmd:`run` commands can then easily be ``rerun``.
@@ -55,7 +57,7 @@ sharing a dataset with a simple example.
:dlcmd:`diff` and :gitcmd:`diff` can subsequently help to find
out what changes have been made in the sibling.
-Now what I can do with that?
+Now what can I do with that?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Most importantly, you have experienced the first way of sharing
@@ -65,8 +67,8 @@ the book you will see examples in which datasets are shared on the same
file system in surprisingly useful ways.
Simultaneously, you have observed dataset properties you already knew
-(for example how annexed files need to be retrieved via :dlcmd:`get`),
-but you have also seen novel aspects of a dataset -- for example that
+(for example, how annexed files need to be retrieved via :dlcmd:`get`),
+but you have also seen novel aspects of a dataset -- for example, that
subdatasets are not automatically installed by default, how
:gitannexcmd:`whereis` can help you find out where file content might be stored,
how useful commands that capture provenance about the origin or creation of files
diff --git a/docs/basics/101-121-siblings.rst b/docs/basics/101-121-siblings.rst
index 04713bef4..2d2cf66e8 100644
--- a/docs/basics/101-121-siblings.rst
+++ b/docs/basics/101-121-siblings.rst
@@ -13,12 +13,15 @@ or changes you make to your dataset, and stay up to date.
This is because a DataLad dataset makes updating shared
data a matter of a single :dlcmd:`update --how merge` command.
-But why does this need to be a one-way line? "I want to
+But why does this need to be a one-way street? "I want to
provide helpful information for you as well!", says your
room mate. "How could you get any insightful notes that
I make in my dataset, or maybe the results of our upcoming
mid-term project? It's a bit unfair that I can get your work,
-but you can not get mine."
+but you cannot get mine."
+
+.. index::
+ pair: register file with URL in dataset; with DataLad
Consider, for example, that your room mate might have googled about DataLad
a bit. In the depths of the web, he might have found useful additional information, such
@@ -26,7 +29,7 @@ a script on `dataset nesting `_
configurations than the ones in this config file, but
they are related to Git, and less related or important to the configuration of
a DataLad dataset. We will use this section to showcase the anatomy of the
-:gitcmd:`config` command. If for example you would want to specifically
+:gitcmd:`config` command. If, for example, you would want to specifically
configure :term:`nano` to be the default editor in this dataset, you
can do it like this:
@@ -170,7 +174,7 @@ Let's see what has changed:
$ cat .git/config
-With this additional line in your repositories Git configuration, nano will
+With this additional line in your repository's Git configuration, ``nano`` will
be used as a default editor regardless of the configuration in your global
or system-wide configuration. Note that the flag ``--local`` applies the
configuration to your repository's ``.git/config`` file, whereas ``--global``
@@ -179,9 +183,9 @@ system-wide configuration.
If you would want to change this existing line in your ``.git/config``
file, you would replace ``--add`` with ``--replace-all`` such as in:
-.. code-block:: bash
+.. code-block:: console
- git config --local --replace-all core.editor "vim"
+ $ git config --local --replace-all core.editor "vim"
to configure :term:`vim` to be your default editor.
Note that while being a good toy example, it is not a common thing to
@@ -195,10 +199,12 @@ a value, one can configure Git, git-annex, and DataLad.
of Git, depending on the scope (local, global, system-wide)
specified in the command.
+.. index::
+ pair: unset configuration; with Git
.. find-out-more:: If things go wrong during Git config
If something goes wrong during the :gitcmd:`config` command,
- for example you end up having two keys of the same name because you
+ for example, you end up having two keys of the same name because you
added a key instead of replacing an existing one, you can use the
``--unset`` option to remove the line. Alternatively, you can also open
the config file in an editor and remove or change sections by hand.
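+
+   For example, to remove an accidentally added local ``core.editor`` entry entirely, something like this would work (a sketch):
+
+   .. code-block:: console
+
+      $ git config --local --unset core.editor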
@@ -242,6 +248,7 @@ to de-mystify the :gitcmd:`config` command and the configuration files.
Nevertheless, it might be helpful to get an overview about the meaning of the
remaining sections in that file, and the :ref:`Find-out-more that dissects this config file further <fom_gitconfig>` can give you a glimpse of this.
+.. index:: dataset configuration
.. find-out-more:: Dissecting a Git config file further
:name: fom_gitconfig
:float:
@@ -249,8 +256,7 @@ remaining sections in that file, and the :ref:`that dissects this config file fu
Let's walk through the Git config file of ``DataLad-101``:
As mentioned above, git-annex will use the
:term:`Git config file` for some of its configurations, such as the second section.
- It lists the repository version and git-annex
- UUID [#f4]_ (:gitannexcmd:`whereis` displays information about where the
+ It lists the repository version and :term:`annex UUID` [#f4]_ (:gitannexcmd:`whereis` displays information about where the
annexed content is with these UUIDs).
You may recognize the fourth part of the configuration, the subsection
@@ -283,12 +289,27 @@ remaining sections in that file, and the :ref:`that dissects this config file fu
The value to the ``url`` variable is a *path*. If at any point
either your superdataset or the remote moves on your file system,
the association between the two datasets breaks -- this can be fixed by adjusting this
- path, and a demonstration of this is in section :ref:`filesystem`.
+ path, and a demonstration of this is in section :ref:`file system`.
`fetch` contains a specification of which parts of the repository are
updated -- in this case everything (all of the branches).
Lastly, the ``annex-ignore = false`` configuration allows git-annex
to query the remote when it tries to retrieve data from annexed content.
+.. index::
+ pair: configuration; DataLad command
+ pair: set configuration; with DataLad
+
+The ``datalad configuration`` command
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Although this section focused on the ``git config`` command, it is important to mention that there also is a :dlcmd:`configuration` command.
+It is not identical to ``git config``: while it lacks some features of ``git config``, such as the ability to set system-wide configurations, it has additional features of its own.
+Beyond the ``local`` and ``global`` scopes, it also supports :term:`branch` specific configurations in the ``.datalad/config`` file (further discussed in the next section), setting configurations recursively through dataset hierarchies, and multi-configuration queries (such as ``datalad configuration get user.name user.email``).
+By default, ``datalad configuration`` will ``dump`` (list) the effective configuration including relevant ``DATALAD_*`` :term:`environment variable`\s, and also annotate the purpose of many common configuration items.
+The subcommands ``datalad configuration get`` or ``datalad configuration set`` perform queries or set configurations.
+You can find more information on this command in its documentation.
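+
+A minimal sketch of typical calls (the configuration items shown here are just examples):
+
+.. code-block:: console
+
+   $ # query one or more configuration items at once
+   $ datalad configuration get user.name user.email
+   $ # set a configuration in the dataset's local scope
+   $ datalad configuration set datalad.log.level=debug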
+
+
``.git/config`` versus other (configuration) files
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -319,8 +340,8 @@ The next section will talk about them.
.. rubric:: Footnotes
.. [#f1] As an alternative to a ``git config`` command, you could also run configuration
- templates or procedures (see :ref:`procedures`) that apply predefined configurations or in some cases even
- add the information to the configuration file by hand and save it using an editor of your choice.
+ templates or procedures that apply predefined configurations or in some cases even
+ add the information to the configuration file by hand and save it using an editor of your choice. See :ref:`procedures` for more info.
.. [#f2] The third scope of a Git configuration are the system wide configurations.
These are stored (if they exist) in ``/etc/gitconfig`` and contain settings that would
@@ -334,14 +355,14 @@ The next section will talk about them.
1) Open up the file with an editor of your choice (e.g., `nano <https://www.nano-editor.org/>`_), and either paste the following configuration or edit it if it already exists:
- .. code-block:: bash
+ .. code-block:: ini
[core]
editor = nano
2) Run the following command, but exchange ``nano`` with an editor of your choice:
- .. code-block:: bash
+ .. code-block:: console
$ git config --global --add core.editor "nano"
@@ -349,6 +370,6 @@ The next section will talk about them.
that unambiguously identifies information.
.. [#f5] Please note that not all configurations can be written to files other than ``.git/config``.
- Some of the files introduced in the next section will not be queried by Git, and in principle, it is a good thing that one can not share arbitrary configurations together with a dataset, as this could be a potential security threat.
+ Some of the files introduced in the next section will not be queried by Git, and in principle, it is a good thing that one cannot share arbitrary configurations together with a dataset, as this could be a potential security threat.
In those cases where you need dataset clones to inherit certain non-sticky configurations, it is advised to write a custom procedure and distribute it together with the dataset.
The next two sections contain concrete use cases and tutorials.
diff --git a/docs/basics/101-123-config2.rst b/docs/basics/101-123-config2.rst
index 3ece377d5..fb5fb7907 100644
--- a/docs/basics/101-123-config2.rst
+++ b/docs/basics/101-123-config2.rst
@@ -12,16 +12,16 @@ All of these files store configurations, but have an important difference:
They are version controlled, and upon sharing a dataset these configurations
will be shared as well. An example for a shared configuration
is the one that the ``text2git`` configuration template applied:
-In the shared copy of your dataset, text files are also saved with Git,
-and not git-annex (see section :ref:`sibling`). The configuration responsible
+In the shared copy of your dataset from :ref:`sibling`, text files are also saved with Git,
+and not git-annex. The configuration responsible
for this behavior is in a ``.gitattributes`` file, and we'll start this
section by looking into it.
+.. index:: ! configuration file; .gitattributes
+
``.gitattributes``
^^^^^^^^^^^^^^^^^^
-.. index:: ! Config files; .gitattributes
-
This file lies right in the root of your superdataset:
.. runrecord:: _examples/DL-101-123-101
@@ -34,11 +34,10 @@ This looks neither spectacular nor pretty. Also, it does not follow the ``sectio
organization of the ``.git/config`` file anymore. Instead, there are three lines,
and all of these seem to have something to do with the configuration of git-annex.
There even is one key word that you recognize: MD5E.
-If you have read the hidden section in :ref:`symlink`
+If you have read the :ref:`Find-out-more on object trees `
you will recognize it as a reference to the type of
key used by git-annex to identify and store file content in the object-tree.
-The first row, ``* annex.backend=MD5E``, therefore translates to "Everything in this
-directory should be hashed with a MD5E hash function".
+The first row, ``* annex.backend=MD5E``, therefore translates to "The ``MD5E`` git-annex backend should be used for any file".
But what is the rest? We'll start with the last row:
.. code-block:: bash
@@ -66,17 +65,14 @@ configured git-annex to regard all files of type "binary" as a large file.
Thanks to this little line, your text files are not annexed, but stored
directly in Git.
-The patterns ``*`` and ``**`` are so-called "wildcards" used in :term:`globbing`.
-``*`` matches any file or directory in the current directory, and ``**`` matches
-all files and directories in the current directory *and subdirectories*. In technical
-terms, ``**`` matches *recursively*. The third row therefore
-translates to "Do not annex anything that is a text file in this directory" for git-annex.
-
-However, rules can be even simpler. The second row simply takes a complete directory
-(``.git``) and instructs git-annex to regard nothing in it as a "large file".
-The second row, ``**/.git* annex.largefiles=nothing`` means that no
-``.git`` repository in this directory or a subdirectory should be considered
-a "large file". This way, the ``.git`` repositories are protected from being annexed.
+The patterns ``*`` and ``**`` are so-called "wildcards" that you might recognize from :term:`globbing`.
+In Git configuration files, an asterisk (``*``) matches anything except a slash.
+The third row therefore
+translates to "Do not annex anything that is a text file" for git-annex.
+Two leading asterisks ("``**``") followed by a slash match
+*recursively* in all directories.
+Therefore, the second row instructs git-annex to regard nothing starting with ``.git`` as a "large file", including contents inside of ``.git`` directories.
+This way, the ``.git`` repositories are protected from being annexed.
If you had a single file (``myfile.pdf``) you would not want annexed, specifying a rule such as:
.. code-block:: bash
@@ -133,11 +129,11 @@ Later however you will see preconfigured DataLad *procedures* such as ``text2git
can apply useful configurations for you, just as ``text2git`` added the last line
in the root ``.gitattributes`` file.
+.. index:: ! configuration file; .gitmodules
+
``.gitmodules``
^^^^^^^^^^^^^^^
-.. index:: ! Config files; .gitmodules
-
One last configuration file that Git creates is the ``.gitmodules`` file.
There is one right in the root of your dataset:
@@ -148,13 +144,14 @@ There is one right in the root of your dataset:
$ cat .gitmodules
Based on these contents, you might have already guessed what this file
-stores. ``.gitmodules`` is a configuration file that stores the mapping between
+stores. The ``.gitmodules`` file is a configuration file that stores the mapping between
your own dataset and any subdatasets you have installed in it.
There will be an entry for each submodule (subdataset) in your dataset.
The name *submodule* is Git terminology, and describes a Git repository inside of
another Git repository, i.e., the super- and subdataset principles.
Upon sharing your dataset, the information about subdatasets and where to retrieve
them from is stored and shared with this file.
+In addition to modifying it with the ``git config`` command or by hand, the ``datalad subdatasets`` command also has a ``--set-property NAME VALUE`` option that you can use to set subdataset properties.
Section :ref:`sharelocal1` already mentioned one additional configuration option in a footnote: The ``datalad-recursiveinstall`` key.
This key is defined on a per subdataset basis, and if set to "``skip``", the given subdataset will not be recursively installed unless it is explicitly specified as a path to :dlcmd:`get [-n/--no-data] -r`.
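+With ``--set-property``, applying this configuration could look like the following sketch (run in the superdataset; the subdataset name matches the demonstration below):
+
+.. code-block:: console
+
+   $ datalad subdatasets --set-property datalad-recursiveinstall skip subds1
+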
@@ -163,81 +160,49 @@ Below is a minimally functional example on how to apply the configuration and ho
Let's create a dataset hierarchy to work with (note that we concatenate multiple commands into a single line using bash's "and" ``&&`` operator):
-.. code-block:: bash
+.. code-block:: console
- # create a superdataset with two subdatasets
- $ datalad create superds && cd superds && datalad create -d . subds1 && datalad create -d . subds2
- [INFO ] Creating a new annex repo at /tmp/superds
+ $ # create a superdataset with two subdatasets
+ $ datalad create superds && datalad -C superds create -d . subds1 && datalad -C superds create -d . subds2
create(ok): /tmp/superds (dataset)
- [INFO ] Creating a new annex repo at /tmp/superds/subds1
add(ok): subds1 (file)
add(ok): .gitmodules (file)
save(ok): . (dataset)
create(ok): subds1 (dataset)
- action summary:
- add (ok: 2)
- create (ok: 1)
- save (ok: 1)
- [INFO ] Creating a new annex repo at /tmp/superds/subds2
add(ok): subds2 (file)
add(ok): .gitmodules (file)
save(ok): . (dataset)
create(ok): subds2 (dataset)
- action summary:
- add (ok: 2)
- create (ok: 1)
- save (ok: 1)
Next, we create subdatasets in the subdatasets:
-.. code-block:: bash
+.. code-block:: console
- # create two subdatasets in subds1
- $ cd subds1 && datalad create -d . subsubds1 && datalad create -d . subsubds2 && cd ../
- [INFO ] Creating a new annex repo at /tmp/superds/subds1/subsubds1
+ $ # create two subdatasets in subds1
+ $ datalad -C superds/subds1 create -d . subsubds1 && datalad -C superds/subds1 create -d . subsubds2
add(ok): subsubds1 (file)
add(ok): .gitmodules (file)
save(ok): . (dataset)
create(ok): subsubds1 (dataset)
- action summary:
- add (ok: 2)
- create (ok: 1)
- save (ok: 1)
- [INFO ] Creating a new annex repo at /tmp/superds/subds1/subsubds2
add(ok): subsubds2 (file)
add(ok): .gitmodules (file)
save(ok): . (dataset)
create(ok): subsubds2 (dataset)
- action summary:
- add (ok: 2)
- create (ok: 1)
- save (ok: 1)
-
- # create two subdatasets in subds2
- $ cd subds2 && datalad create -d . subsubds1 && datalad create -d . subsubds2
- [INFO ] Creating a new annex repo at /tmp/superds/subds2/subsubds1
+ $ # create two subdatasets in subds2
+ $ datalad -C superds/subds2 create -d . subsubds1 && datalad -C superds/subds2 create -d . subsubds2
add(ok): subsubds1 (file)
add(ok): .gitmodules (file)
save(ok): . (dataset)
create(ok): subsubds1 (dataset)
- action summary:
- add (ok: 2)
- create (ok: 1)
- save (ok: 1)
- [INFO ] Creating a new annex repo at /tmp/superds/subds2/subsubds2
add(ok): subsubds2 (file)
add(ok): .gitmodules (file)
save(ok): . (dataset)
create(ok): subsubds2 (dataset)
- action summary:
- add (ok: 2)
- create (ok: 1)
- save (ok: 1)
Here is the directory structure:
-.. code-block:: bash
+.. code-block:: console
$ cd ../ && tree
.
@@ -248,51 +213,40 @@ Here is the directory structure:
├── subsubds1
└── subsubds2
- # save in the superdataset
+ $ # save in the superdataset
- datalad save -m "add a few sub and subsub datasets"
+ $ datalad save -m "add a few sub and subsub datasets"
add(ok): subds1 (file)
add(ok): subds2 (file)
save(ok): . (dataset)
- action summary:
- add (ok: 2)
- save (ok: 1)
-Now, we can apply the ``datalad-recursiveinstall`` configuration to skip recursive installations for subds1
+Now, we can apply the ``datalad-recursiveinstall`` configuration to skip recursive installations for ``subds1``:
-.. code-block:: bash
+.. code-block:: console
$ git config -f .gitmodules --add submodule.subds1.datalad-recursiveinstall skip
- # save this configuration
+ $ # save this configuration
$ datalad save -m "prevent recursion into subds1, unless explicitly given as path"
add(ok): .gitmodules (file)
save(ok): . (dataset)
- action summary:
- add (ok: 1)
- save (ok: 1)
+
If the dataset is cloned, and someone runs a recursive :dlcmd:`get`, the subdatasets of ``subds1`` will not be installed, the subdatasets of ``subds2``, however, will be.
-.. code-block:: bash
+.. code-block:: console
- # clone the dataset somewhere else
+ $ # clone the dataset somewhere else
$ cd ../ && datalad clone superds clone_of_superds
[INFO ] Cloning superds into '/tmp/clone_of_superds'
install(ok): /tmp/clone_of_superds (dataset)
- # recursively get all contents (without data)
+ $ # recursively get all contents (without data)
$ cd clone_of_superds && datalad get -n -r .
- [INFO ] Installing underneath /tmp/clone_of_superds recursively
- [INFO ] Cloning /tmp/superds/subds2 into '/tmp/clone_of_superds/subds2'
get(ok): /tmp/clone_of_superds/subds2 (dataset)
- [INFO ] Cloning /tmp/superds/subds2/subsubds1 into '/tmp/clone_of_superds/subds2/subsubds1'
get(ok): /tmp/clone_of_superds/subds2/subsubds1 (dataset)
- [INFO ] Cloning /tmp/superds/subds2/subsubds2 into '/tmp/clone_of_superds/subds2/subsubds2'
get(ok): /tmp/clone_of_superds/subds2/subsubds2 (dataset)
- action summary:
- get (ok: 3)
- # only subsubds of subds2 are installed, not of subds1:
+ $ # only subsubds of subds2 are installed, not of subds1:
$ tree
.
├── subds1
@@ -304,12 +258,10 @@ If the dataset is cloned, and someone runs a recursive :dlcmd:`get`, the subdata
Nevertheless, if ``subds1`` is provided with an explicit path, its subdataset ``subsubds`` will be cloned, essentially overriding the configuration:
-.. code-block:: bash
+.. code-block:: console
$ datalad get -n -r subds1 && tree
- [INFO ] Cloning /tmp/superds/subds1 into '/tmp/clone_of_superds/subds1'
install(ok): /tmp/clone_of_superds/subds1 (dataset) [Installed subdataset in order to get /tmp/clone_of_superds/subds1]
- [INFO ] Installing underneath /tmp/clone_of_superds/subds1 recursively
.
├── subds1
│ ├── subsubds1
@@ -321,13 +273,11 @@ Nevertheless, if ``subds1`` is provided with an explicit path, its subdataset ``
6 directories, 0 files
-
+.. index:: ! configuration file; .datalad/config
``.datalad/config``
^^^^^^^^^^^^^^^^^^^
-.. index:: ! Config files; .datalad/config
-
DataLad adds a repository-specific configuration file as well.
It can be found in the ``.datalad`` directory, and just like ``.gitattributes``
and ``.gitmodules`` it is version controlled and is thus shared together with
@@ -351,6 +301,9 @@ Otherwise, dataset updates with :dlcmd:`update` (or, for Git-users,
behavior that was specifically configured.
Also, :term:`Git` and :term:`git-annex` will not query this file for configurations, so please store only sticky options that are specific to DataLad (i.e., under the ``datalad.*`` namespace) in it.
+.. index::
+ pair: modify configuration; with Git
+
Writing to configuration files other than ``.git/config``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -362,7 +315,7 @@ files, and I do not know with which command I can write into these files."
it's also the :gitcmd:`config` command. The only part of it you need to
adjust is the ``-f``, ``--file`` parameter. By default, the command writes to
a Git config file. But it can write to a different file if you specify it
-appropriately. For example
+appropriately. For example,
``git config --file=.gitmodules --replace-all submodule."name".url "new URL"``
@@ -409,20 +362,19 @@ unstaged modification.
$ git checkout .gitmodules
$ datalad status
-Note, though, that the ``.gitattributes`` file can not be modified with a :gitcmd:`config`
+Note, though, that the ``.gitattributes`` file cannot be modified with a :gitcmd:`config`
command. This is due to its different format that does not comply to the
``section.variable.value`` structure of all other configuration files. This file, therefore,
has to be edited by hand, with an editor of your choice.
+.. index:: ! environment variable
.. _envvars:
Environment variables
^^^^^^^^^^^^^^^^^^^^^
-.. index:: ! environment variable
-
An :term:`environment variable` is a variable set up in your shell
-that affects the way the shell or certain software works -- for example
+that affects the way the shell or certain software works -- for example,
the environment variables ``HOME``, ``PWD``, or ``PATH``.
Configuration options that determine the behavior of Git, git-annex, and
DataLad that could be defined in a configuration file can also be set (or overridden)
@@ -435,6 +387,8 @@ configuration of Git can be overridden by its associated environment variable,
``GIT_AUTHOR_NAME``. Likewise, one can define the environment variable instead
of setting the ``user.name`` configuration in a configuration file.
+.. index:: configuration item; datalad.log.level
+
Git, git-annex, and DataLad have more environment variables than anyone would want to
remember. `The ProGit book `__
has a good overview on Git's most useful available environment variables for a start.
@@ -445,12 +399,13 @@ replacing any ``__`` (two underscores) with a hyphen, then any ``_`` (single und
with a dot, and finally converting all letters to lower case. The ``datalad.log.level``
configuration option thus is the environment variable ``DATALAD_LOG_LEVEL``.
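+
+For example, to raise DataLad's log verbosity for a single command without touching any configuration file, you could prefix the call like this (a sketch):
+
+.. code-block:: console
+
+   $ DATALAD_LOG_LEVEL=debug datalad status
+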
+.. index:: operating system concept; environment variable
.. find-out-more:: Some more general information on environment variables
:name: fom-envvar
Names of environment variables are often all-uppercase. While the ``$`` is not part of
the name of the environment variable, it is necessary to *refer* to the environment
- variable: To reference the value of the environment variable ``HOME`` for example you would
+ variable: To reference the value of the environment variable ``HOME``, for example, you would
need to use ``echo $HOME`` and not ``echo HOME``. However, environment variables are
set without a leading ``$``. There are several ways to set an environment variable
(note that there are no spaces before and after the ``=`` !), leading to different
@@ -475,7 +430,7 @@ This has been an intense lecture, you have to admit. One definite
take-away from it has been that you now know a second reason why the hidden
``.git`` and ``.datalad`` directory contents and also the contents of ``.gitmodules`` and
``.gitattributes`` should not be carelessly tampered with -- they contain all of
-the repositories configurations.
+the repository's configurations.
But you now also know how to modify these configurations with enough
care and background knowledge such that nothing should go wrong once you
@@ -486,7 +441,7 @@ you already know some core Git configurations such as name, email, and editor. E
important, you know how to configure git-annex's content management based on ``largefile``
rules, and you understand the variables within ``.gitmodules`` or the sections
in ``.git/config``. Slowly, you realize with pride,
-you're more and more becoming a DataLad power-user.
+you are more and more becoming a DataLad power-user.
Write a note about configurations in datasets into ``notes.txt``.
@@ -540,13 +495,13 @@ Write a note about configurations in datasets into ``notes.txt``.
extension (such as ``.txt``, ``.pdf``, ``.jpg``) for the operating system to know
how to open or use this file (in contrast to Windows, which does not know how to
open a file without an extension). To do this, Unix systems rely on a file's
- MIME type -- an information about a file's content. A ``.txt`` file for example
+ MIME type -- an information about a file's content. A ``.txt`` file, for example,
has MIME type ``text/plain`` as does a bash script (``.sh``), a Python
script has MIME type ``text/x-python``, a ``.jpg`` file is ``image/jpeg``, and
a ``.pdf`` file has MIME type ``application/pdf``. You can find out the MIME type
of a file by running:
- .. code-block:: bash
+ .. code-block:: console
$ file --mime-type path/to/file
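+
+   For a plain text file, this may print something like the following (a
+   sketch; the exact detection depends on your system):
+
+   .. code-block:: console
+
+      $ file --mime-type notes.txt
+      notes.txt: text/plain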
@@ -556,7 +511,7 @@ Write a note about configurations in datasets into ``notes.txt``.
specified in there take precedence over rules in ``.gitattributes``. You can set
them using the :gitcmd:`config` command:
- .. code-block:: bash
+ .. code-block:: console
$ git config annex.largefiles 'largerthan=100kb and not (include=*.c or include=*.h)'
diff --git a/docs/basics/101-124-procedures.rst b/docs/basics/101-124-procedures.rst
index 9185e2ffa..4336962ec 100644
--- a/docs/basics/101-124-procedures.rst
+++ b/docs/basics/101-124-procedures.rst
@@ -1,10 +1,9 @@
+.. index:: ! procedures, run-procedures
.. _procedures:
Configurations to go
--------------------
-.. index:: ! procedures, run-procedures
-
The past two sections should have given you a comprehensive
overview of the different configuration options the tools
Git, git-annex, and DataLad provide. They not only
@@ -33,7 +32,7 @@ This particular procedure lives in a script called
in this script is not large, and the relevant lines of code
are highlighted:
-.. code-block:: bash
+.. code-block:: python
:emphasize-lines: 12, 16-17
import sys
@@ -64,7 +63,7 @@ Just like ``cfg_text2git``, all DataLad procedures are
executables (such as a script, or compiled code).
In principle, they can be written in any language, and perform
any task inside of a dataset.
-The ``text2git`` configuration for example applies a configuration for how
+The ``text2git`` configuration, for example, applies a configuration for how
git-annex treats different file types. Other procedures do not
only modify ``.gitattributes``, but can also populate a dataset
with particular content, or automate routine tasks such as
@@ -73,7 +72,7 @@ What makes them a particularly versatile and flexible tool is
that anyone can write their own procedures.
If a workflow is a standard in a team and needs to be applied often, turning it into
a script can save time and effort.
-To learn how to do this, read the :ref:`with a tutorial on writing own procedures `.
+To learn how to do this, read the :ref:`tutorial on writing your own procedures `.
By pointing DataLad to the location the procedures reside in, they can be applied, and by
including them in a dataset they can even be shared.
And even if the script is simple, it is very handy to have preconfigured
@@ -85,8 +84,10 @@ spares naive users the necessity to learn about the ``.gitattributes``
file when setting up a dataset.
-.. index:: ! datalad command; run-procedure
-
+.. index::
+ pair: run-procedure; DataLad command
+ pair: discover dataset procedures; with DataLad
+ pair: discover; dataset procedure
To find out which procedures are available, the command
:dlcmd:`run-procedure --discover` is helpful.
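+
+A call like the following lists all available procedures and their
+locations (the concrete list depends on your installation and any
+installed extensions):
+
+.. code-block:: console
+
+   $ datalad run-procedure --discover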
@@ -113,6 +114,10 @@ they are all part of the source code of DataLad [#f1]_.
- ``cfg_metadatatypes`` lets users configure additional metadata
types.
+.. index::
+ pair: run dataset procedure; with DataLad
+ pair: run; dataset procedure
+
Applying procedures
^^^^^^^^^^^^^^^^^^^
@@ -126,6 +131,10 @@ with the ``-d/--dataset`` flag:
datalad run-procedure [-d ] cfg_text2git
+.. index::
+ pair: run dataset procedure on dataset creation; with DataLad
+ pair: run on dataset creation; dataset procedure
+
The typical workflow is to create a dataset and apply
a procedure afterwards.
However, some procedures shipped with DataLad or its extensions with a
@@ -135,9 +144,9 @@ command. This is a peculiarity of these procedures because, by convention,
all of these procedures are written to not require arguments.
The command structure looks like this:
-.. code-block:: bash
+.. code-block:: console
- datalad create -c text2git DataLad-101
+ $ datalad create -c text2git DataLad-101
Note that the ``cfg_`` prefix of the procedures is omitted in these
calls to keep it extra simple and short. The
@@ -147,15 +156,17 @@ could thus be applied within a :dlcmd:`create` as
- ``datalad create -c yoda ``
- ``datalad create -c text2git ``
+.. index:: dataset procedure; apply more than one configuration
.. find-out-more:: Applying multiple procedures
If you want to apply several configurations at once, feel free to do so,
for example like this:
- .. code-block:: bash
+ .. code-block:: console
$ datalad create -c yoda -c text2git
+.. index:: dataset procedure; apply to subdatasets
.. find-out-more:: Applying procedures in subdatasets
Procedures can be applied in datasets on any level in the dataset hierarchy, i.e.,
@@ -171,7 +182,7 @@ could thus be applied within a :dlcmd:`create` as
As a general note, it can be useful to apply procedures
early in the life of a dataset. Procedures such
-as ``cfg_yoda`` (explained in detail in section :ref:`yoda`),
+as ``cfg_yoda``, explained in detail in section :ref:`yoda`,
create files, change ``.gitattributes``, or apply other configurations.
If many other (possibly complex) configurations are
already in place, or if files of the same name as the ones created by
@@ -182,7 +193,13 @@ to a default dataset in which one has saved many text files already
files into Git -- only those text files created *after* the configuration
was applied.
-
+.. index::
+ single: configuration item; datalad.locations.system-procedures
+ single: configuration item; datalad.locations.user-procedures
+ single: configuration item; datalad.locations.dataset-procedures
+ single: configuration item; datalad.procedures..call-format
+ single: configuration item; datalad.procedures..help
+   single: dataset procedures; write your own
.. find-out-more:: Write your own procedures
:name: fom-procedures
:float:
@@ -191,8 +208,7 @@ was applied.
write their own ones in addition, and deploy them on individual machines,
or ship them within DataLad datasets. This allows you to
automate routine configurations or tasks in a dataset, or share configurations that would otherwise not "stick" to the dataset.
- Some general rules for creating a custom procedure are outlined
- below:
+ Here are some general rules for creating a custom procedure:
- A procedure can be any executable. Executables must have the
appropriate permissions and, in the case of a script,
@@ -211,7 +227,7 @@ was applied.
a short "help" description on what the procedure does. Below is a minimal
``.datalad/config`` entry for a custom procedure:
- .. code-block:: bash
+ .. code-block:: ini
[datalad "procedures."]
help = This is a string to describe what the procedure does
@@ -233,7 +249,7 @@ was applied.
or ``datalad.locations.dataset-procedures`` (for changing the *local* default).
An example ``.datalad/config`` entry for the local scope is shown below.
- .. code-block:: bash
+ .. code-block:: ini
[datalad "locations"]
dataset-procedures = relative/path/from/dataset-root
@@ -241,7 +257,7 @@ was applied.
- By default, DataLad will call a procedure with a standard template
defined by a format string:
- .. code-block:: bash
+ .. code-block::
interpreter {script} {ds} {arguments}
@@ -251,7 +267,7 @@ was applied.
An example ``.datalad/config`` entry with a changed call format string
is shown below.
- .. code-block:: bash
+ .. code-block:: ini
[datalad "procedures."]
help = This is a string to describe what the procedure does
@@ -335,14 +351,14 @@ was applied.
:language: console
:workdir: procs/somedataset
- #the directory structure has been created
+ $ # the directory structure has been created
$ tree
.. runrecord:: _examples/DL-101-124-108
:workdir: procs/somedataset
:language: console
- #lets check out the contents in the files
+   $ # let's check out the contents of the files
$ cat example2 && echo '' && cat somedir/example
.. runrecord:: _examples/DL-101-124-109
diff --git a/docs/basics/101-125-summary.rst b/docs/basics/101-125-summary.rst
index 8a2550245..6f0a4c46a 100644
--- a/docs/basics/101-125-summary.rst
+++ b/docs/basics/101-125-summary.rst
@@ -14,7 +14,7 @@ your horizon about configurations of datasets:
``.datalad/config`` that apply to a specific dataset, but are committed and
therefore distributed. More specialized scopes take precedence over more global scopes.
-- Almost all configurations can be set with the :gitcmd:`config`.
+- Almost all configurations can be set with the :gitcmd:`config` command.
Its structure looks like this:
.. code-block:: bash
@@ -32,7 +32,7 @@ your horizon about configurations of datasets:
in a :gitcmd:`config` command.
- The ``.gitattributes`` file is the only configuration file the :gitcmd:`config`
- can not write to, because it has a different layout. However, run-procedures or
+ command cannot write to, because it has a different layout. However, run-procedures or
the user can write simple rules into it that determine which files are annexed
and which are stored in Git.
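+
+  Such rules could, for example, look like this (a sketch; the concrete
+  expressions depend on your needs):
+
+  .. code-block:: console
+
+     $ cat .gitattributes
+     * annex.largefiles=(largerthan=100kb)
+     *.txt annex.largefiles=nothing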
diff --git a/docs/basics/101-126-intro.rst b/docs/basics/101-126-intro.rst
index d249507df..1824c3712 100644
--- a/docs/basics/101-126-intro.rst
+++ b/docs/basics/101-126-intro.rst
@@ -1,6 +1,6 @@
.. _intromidterm:
-A Data Analysis Project with DataLad
+A data analysis project with DataLad
------------------------------------
diff --git a/docs/basics/101-127-yoda.rst b/docs/basics/101-127-yoda.rst
index 92e0c41df..cd88ea83b 100644
--- a/docs/basics/101-127-yoda.rst
+++ b/docs/basics/101-127-yoda.rst
@@ -1,11 +1,10 @@
+.. index:: ! YODA principles
.. _2-001:
.. _yoda:
YODA: Best practices for data analyses in a dataset
---------------------------------------------------
-.. index:: ! YODA principles
-
The last requirement for the midterm projects reads "needs to comply to the
YODA principles".
"What are the YODA principles?" you ask, as you have never heard of this
@@ -33,7 +32,7 @@ such as ``results/``, ``results_August19/``, ``results_revision/`` and
``now_with_nicer_plots/``. Something like this is a very
common shape a data science project may take after a while:
-.. code-block:: bash
+.. code-block:: console
├── code/
│ ├── code_final/
@@ -48,10 +47,10 @@ common shape a data science project may take after a while:
│ └── main_script_DONTUSE.py
├── data/
│ ├── data_updated/
- │ │ └── dataset1/
- │ │ ├── datafile_a
+ │ │ └── dataset1/
+ │ │ └── datafile_a
│ ├── dataset1/
- │ │ ├── datafile_a
+ │ │ └── datafile_a
│ ├── outputs/
│ │ ├── figures/
│ │ │ ├── figures_new.py
@@ -129,10 +128,10 @@ computational environments, results, ...) in dedicated directories. For example:
- Store scripts or **code** used for the analysis of data in a dedicated ``code/``
directory, outside of the data component of the dataset.
-- Collect **results** of an analysis in a dedicated ``outputs/`` directory, and
+- Collect **results** of an analysis in a dedicated place, outside of the ``inputs/`` directory, and
leave the input data of an analysis untouched by your computations.
-- Include a place for complete **execution environments**, for example
+- Include a place for complete **execution environments**, such as
`singularity images `_ or
`docker containers `_ [#f2]_, in
the form of an ``envs/`` directory, if relevant for your analysis.
@@ -143,7 +142,7 @@ computational environments, results, ...) in dedicated directories. For example:
This, for example, would be a directory structure from the root of a
superdataset of a very comprehensive data analysis project complying to the YODA principles:
-.. code-block:: bash
+.. code-block:: console
├── ci/ # continuous integration configuration
│ └── .travis.yml
@@ -162,15 +161,16 @@ superdataset of a very comprehensive data analysis project complying to the YODA
│ │ └── datafile_a
│ └── dataset2/
│ └── datafile_a
- ├── outputs/ # outputs away from the input data
- │ └── important_results/
- │ └── figures/
+ ├── important_results/ # outputs away from the input data
+ │ └── figures/
├── CHANGELOG.md # notes for fellow humans about your project
├── HOWTO.md
└── README.md
You can find a few pieces of non-DataLad-related advice on structuring your directories in the Find-out-more :ref:`on best practices for analysis organization `.
+.. index::
+ pair: recommendation; dataset content organization
.. find-out-more:: More best practices for organizing contents in directories
:name: fom-yodaproject
:float:
@@ -181,7 +181,7 @@ You can get a few non-DataLad related advice for structuring your directories in
#. Within ``code/``, it is best practice to add **tests** for the code.
These tests can be run to check whether the code still works.
- #. It is even better to further use automated computing, for example
+ #. It is even better to further use automated computing such as
`continuous integration (CI) systems `_,
to test the functionality of your functions and scripts automatically.
If relevant, the setup for continuous integration frameworks (such as
@@ -189,7 +189,7 @@ You can get a few non-DataLad related advice for structuring your directories in
in a dedicated ``ci/`` directory.
#. Include **documents for fellow humans**: Notes in a README.md or a HOWTO.md,
- or even proper documentation (for example using in a dedicated ``docs/`` directory.
+   or even proper documentation (for example, in a dedicated ``docs/`` directory).
Within these documents, include all relevant metadata for your analysis. If you are
conducting a scientific study, this might be authorship, funding,
change log, etc.
@@ -211,7 +211,7 @@ for a whole analysis dataset. At one point you might also write a
scientific paper about your analysis in a paper project, and the
whole analysis project can easily become a modular component in a paper
project, to make sharing paper, code, data, and results easy.
-The usecase :ref:`usecase_reproducible_paper` contains a step-by-step instruction on
+The use case :ref:`usecase_reproducible_paper` contains a step-by-step instruction on
how to build and share such a reproducible paper, if you want to learn
more.
@@ -227,7 +227,7 @@ more.
The directory tree above and :numref:`dataset_modules` highlight different aspects
of this principle. The directory tree illustrates the structure of
the individual pieces on the file system from the point of view of
-a single top-level dataset with a particular purpose. It for example
+a single top-level dataset with a particular purpose. For example, it
could be an analysis dataset created by a statistician for a scientific
project, and it could be shared between collaborators or
with others during development of the project. In this
@@ -279,8 +279,8 @@ be included in an analysis superdataset as subdatasets. Thanks to
:dlcmd:`clone`, information on the source of these subdatasets
is stored in the history of the analysis superdataset, and they can even be
updated from those sources if the original data dataset gets extended or changed.
-If you are including a file, for example code from GitHub,
-the :dlcmd:`download-url` command (introduced in section :ref:`populate`)
+If you are including a file, for example, code from GitHub,
+the :dlcmd:`download-url` command, introduced in section :ref:`populate`,
will record the source of it safely in the dataset's history. And if you add anything to your dataset,
from simple incremental coding progress in your analysis scripts up to
files that a colleague sent you via email, a plain :dlcmd:`save`
@@ -290,12 +290,13 @@ on its own already.
One core aspect of this principle is *linking* between reusable data
resource units (i.e., DataLad subdatasets containing pure data). You will
be happy to hear that this is achieved by simply installing datasets
-as subdatasets.
+as subdatasets, as :numref:`fig-subds` shows.
This part of this principle will therefore be absolutely obvious to you
because you already know how to install and nest datasets within datasets.
"I might just overcome my impostor syndrome if I experience such advanced
reproducible analysis concepts as being obvious", you think with a grin.
+.. _fig-subds:
.. figure:: ../artwork/src/img/data_origin.svg
:width: 50%
@@ -323,7 +324,9 @@ everything that is not part of the data according to principle 1.
Conducting transparent open science is easier if you can link code, data,
and results within a dataset, and share everything together. In conjunction
with principle 1, this means that you can distribute your analysis projects
-(or parts of it) in a comprehensible form.
+(or parts of it) in a comprehensible form, exemplified in :numref:`fig-yodads`.
+
+.. _fig-yodads:
.. figure:: ../artwork/src/img/decentralized_publishing.svg
:figwidth: 100%
@@ -357,15 +360,15 @@ which result was generated when, by which author, from which inputs,
and by means of which command.
With another DataLad command one can even go one step further:
-The command :dlcmd:`containers-run` (it will be introduced in
-section :ref:`containersrun`) performs a command execution within
+The command :dlcmd:`containers-run`, which will be introduced in
+section :ref:`containersrun`, performs a command execution within
a configured containerized environment. Thus, not only inputs,
outputs, command, time, and author, but also the *software environment*
are captured as provenance of a dataset component such as a results file,
and, importantly, can be shared together with the dataset in the
form of a software container.
-Tip: Make use of ``datalad run``'s ``--dry-run`` option to craft your run-command (see :ref:`dryrun`)!
+Tip: Make use of ``datalad run``'s ``--dry-run`` option to craft your run-command, as outlined in :ref:`dryrun`!
With this last principle, your dataset collects and stores provenance
of all the contents you created in the wake of your analysis project.
@@ -383,7 +386,7 @@ from section :ref:`createds`, the ``yoda`` procedure can be included in a
:dlcmd:`create` command and will apply useful configurations
to your dataset:
-.. code-block:: bash
+.. code-block:: console
$ datalad create -c yoda "my_analysis"
@@ -395,7 +398,7 @@ to your dataset:
Let's take a look at what configurations and changes come with this procedure:
-.. code-block:: bash
+.. code-block:: console
$ tree -a
@@ -409,7 +412,7 @@ Let's take a look at what configurations and changes come with this procedure:
Let's take a closer look into the ``.gitattributes`` files:
-.. code-block:: bash
+.. code-block:: console
$ less .gitattributes
diff --git a/docs/basics/101-130-yodaproject.rst b/docs/basics/101-130-yodaproject.rst
index 88e3bc1b8..f713e305b 100644
--- a/docs/basics/101-130-yodaproject.rst
+++ b/docs/basics/101-130-yodaproject.rst
@@ -14,6 +14,9 @@ the `Python `__ programming language, you decide
to script your analysis in Python. Delighted, you find out that there is even
a Python API for DataLad's functionality that you can read about in :ref:`a Findoutmore on DataLad in Python`.
+.. _pythonapi:
+.. index::
+ pair: use DataLad API; with Python
.. find-out-more:: DataLad's Python API
:name: fom-pythonapi
:float:
@@ -125,7 +128,7 @@ a Python API for DataLad's functionality that you can read about in :ref:`a Find
'refds': '/home/me/my-ds',
'action': 'status'}]
- .. code-block:: bash
+ .. code-block:: console
$ datalad -f json_pp status myfile
{"action": "status",
@@ -140,10 +143,13 @@ a Python API for DataLad's functionality that you can read about in :ref:`a Find
"type": "file"}
+.. index::
+ pair: use DataLad API; with Matlab
+ pair: use DataLad API; with R
.. importantnote:: Use DataLad in languages other than Python
While there is a dedicated API for Python, DataLad's functions can of course
- also be used with other programming languages, such as Matlab, via standard
+   also be used with other programming languages, such as Matlab or R, via standard
system calls.
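+
+   For example, the following plain command-line call could be handed as a
+   string to Matlab's or R's ``system()`` function (a sketch using a file
+   from later in this section):
+
+   .. code-block:: console
+
+      $ datalad get input/iris.csv
+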
Even if you do not know or like Python, you can just copy-paste the code
@@ -158,10 +164,12 @@ of the flowers in centimeters. It is often used in introductory data science
courses for statistical classification techniques in machine learning, and
widely available -- a perfect dataset for your midterm project!
+.. index::
+ pair: reproducible paper; with DataLad
.. importantnote:: Turn data analysis into dynamically generated documents
Beyond the contents of this section, we have transformed the example analysis also into a template to write a reproducible paper.
- If you're interested in checking that out, please head over to `github.com/datalad-handbook/repro-paper-sketch/ `_.
+ If you are interested in checking that out, please head over to `github.com/datalad-handbook/repro-paper-sketch/ `_.
Raw data as a modular, independent entity
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -181,6 +189,8 @@ dataset at `https://github.com/datalad-handbook/iris_data `.
+.. index::
+ pair: create and publish dataset as dependency; with DataLad
.. find-out-more:: Creating an independent input dataset
:name: fom-iris
@@ -201,7 +211,7 @@ independent dataset from scratch in a :ref:`dedicated Findoutmore `.
:env:
DATALAD_SEED=1
- # make sure to move outside of DataLad-101!
+ $ # make sure to move outside of DataLad-101!
$ cd ../
$ datalad create iris_data
@@ -253,10 +263,12 @@ you use the ``cfg_yoda`` procedure to help you structure the dataset [#f1]_:
DATALAD_SEED=2
:notes: Let's create a data analysis project with a yoda procedure
- # inside of DataLad-101
+ $ # inside of DataLad-101
$ datalad create -c yoda --dataset . midterm_project
-.. index:: ! datalad command; datalad subdatasets
+.. index::
+ pair: subdatasets; DataLad command
+ pair: list subdatasets; with DataLad
The :dlcmd:`subdatasets` command can report on which subdatasets exist for
``DataLad-101``. This helps you verify that the command succeeded and the
@@ -284,7 +296,7 @@ by installing it as a subdataset. Make sure to install it as a subdataset of
:notes: Now clone input data as a subdataset
$ cd midterm_project
- # we are in midterm_project, thus -d . points to the root of it.
+ $ # we are in midterm_project, thus -d . points to the root of it.
$ datalad clone -d . \
https://github.com/datalad-handbook/iris_data.git \
input/
@@ -307,12 +319,14 @@ looks like this:
Importantly, all of the subdatasets are linked to the higher-level datasets,
and despite being inside of ``DataLad-101``, your ``midterm_project`` is an independent
-dataset, as is its ``input/`` subdataset:
+dataset, as is its ``input/`` subdataset. An overview is shown in :numref:`fig-linkeddl101`.
+
+.. _fig-linkeddl101:
.. figure:: ../artwork/src/virtual_dstree_dl101_midterm.svg
- :alt: Overview of (linked) datasets in DataLad-101.
:width: 50%
+ Overview of (linked) datasets in DataLad-101.
YODA-compliant analysis scripts
@@ -341,7 +355,7 @@ To compute the analysis you create the following Python script inside of ``code/
.. runrecord:: _examples/DL-101-130-107
:language: console
:workdir: dl-101/DataLad-101/midterm_project
- :emphasize-lines: 11-13, 23, 42
+ :emphasize-lines: 11-13, 23, 43
:cast: 10_yoda
:notes: Let's create code for an analysis
@@ -399,9 +413,9 @@ This script will
A short help text explains how the script shall be used:
-.. code-block:: bash
+.. code-block:: console
- python code/script.py -h 2 !
+ $ python code/script.py -h
usage: script.py [-h] data output_figure output_report
Analyze iris data
@@ -427,6 +441,10 @@ Let's run a quick :dlcmd:`status`...
$ datalad status
+
+.. index::
+ pair: tag dataset version; with DataLad
+
... and save the script to the subdataset's history. As the script completes your
analysis setup, we *tag* the state of the dataset to refer to it easily at a later
point with the ``--version-tag`` option of :dlcmd:`save`.
@@ -441,6 +459,10 @@ point with the ``--version-tag`` option of :dlcmd:`save`.
--version-tag ready4analysis \
code/script.py
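+
+A quick look at the tag list confirms that the tag is now in place (a
+sketch; your dataset may carry additional tags):
+
+.. code-block:: console
+
+   $ git tag
+   ready4analysis
+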
+.. index::
+ pair: tag; Git concept
+ pair: show; Git command
+ pair: rerun command; with DataLad
.. find-out-more:: What is a tag?
:term:`tag`\s are markers that you can attach to commits in your dataset history.
@@ -460,9 +482,9 @@ point with the ``--version-tag`` option of :dlcmd:`save`.
was added.
Later we can use this tag to identify the point in time at which
the analysis setup was ready -- much more intuitive than a 40-character shasum!
- This is handy in the context of a :dlcmd:`rerun` for example:
+ This is handy in the context of a :dlcmd:`rerun`, for example:
- .. code-block:: bash
+ .. code-block:: console
$ datalad rerun --since ready4analysis
@@ -485,25 +507,21 @@ re-execution with :dlcmd:`rerun` easy.
- `seaborn `_
- `sklearn `_
- The packages can be installed via ``pip`` [#f3]_.
+ The packages can be installed via :term:`pip`.
However, if you do not want to install any
Python packages, do not execute the remaining code examples in this section
-- an upcoming section on ``datalad containers-run`` will allow you to
perform the analysis without changing your Python software setup.
-.. windows-wit:: You may need to use "python", not "python3"
-
- If executing the code below returns an exit code of 9009, there may be no ``python3`` -- instead, it is called solely ``python``.
- Please run the following instead (adjusted for line breaks, you should be able to copy-paste this as a whole):
- .. code-block:: bash
+.. index::
+ pair: python instead of python3; on Windows
+.. windows-wit:: You may need to use 'python', not 'python3'
- datalad run -m "analyze iris data with classification analysis" ^
- --input "input/iris.csv" ^
- --output "pairwise_relationships.png" ^
- --output "prediction_report.csv" ^
- "python code/script.py {inputs} {outputs}"
+ .. include:: topic/py-or-py3.rst
+.. index::
+ pair: run command with provenance capture; with DataLad
.. runrecord:: _examples/DL-101-130-111
:language: console
:workdir: dl-101/DataLad-101/midterm_project
@@ -543,7 +561,7 @@ dataset:
$ git log --oneline
-"Wow, this is so clean an intuitive!" you congratulate yourself. "And I think
+"Wow, this is so clean and intuitive!" you congratulate yourself. "And I think
this was and will be the fastest I have ever completed a midterm project!"
But what is still missing is a human readable description of your dataset.
The YODA procedure kindly placed a ``README.md`` file into the root of your
@@ -556,7 +574,7 @@ dataset that you can use for this [#f4]_.
:cast: 10_yoda
:notes: create human readable information for your project
- # with the >| redirection we are replacing existing contents in the file
+ $ # with the >| redirection we are replacing existing contents in the file
$ cat << EOT >| README.md
# Midterm YODA Data Analysis Project
@@ -588,13 +606,15 @@ dataset that you can use for this [#f4]_.
$ datalad save -m "Provide project description" README.md
Note that one feature of the YODA procedure was that it configured certain files
-(for example everything inside of ``code/``, and the ``README.md`` file in the
+(for example, everything inside of ``code/``, and the ``README.md`` file in the
root of the dataset) to be saved in Git instead of git-annex. This was the
reason why the ``README.md`` in the root of the dataset was easily modifiable.
+.. index::
+ pair: save; DataLad command
+ pair: save file content directly in Git (no annex); with DataLad
.. find-out-more:: Saving contents with Git regardless of configuration with --to-git
- .. index:: ! datalad command; save --to-git
The ``yoda`` procedure in ``midterm_project`` applied a different configuration
within ``.gitattributes`` than the ``text2git`` procedure did in ``DataLad-101``.
@@ -610,7 +630,7 @@ reason why the ``README.md`` in the root of the dataset was easily modifiable.
require you to edit configurations in ``.gitattributes``: The ``--to-git``
option for :dlcmd:`save`.
- .. code-block:: bash
+ .. code-block:: console
$ datalad save -m "add sometextfile.txt" --to-git sometextfile.txt
@@ -620,6 +640,7 @@ everything you did easily.
The only thing left to do is to hand in your assignment. According to the
syllabus, this should be done via :term:`GitHub`.
+.. index:: dataset hosting; GitHub
.. find-out-more:: What is GitHub?
GitHub is a web based hosting service for Git repositories. Among many
@@ -631,12 +652,15 @@ syllabus, this should be done via :term:`GitHub`.
Web-hosting services like GitHub and :term:`GitLab` integrate wonderfully with
DataLad. They are especially useful for making your dataset publicly available,
if you have figured out storage for your large files otherwise (as large content
- can not be hosted for free by GitHub). You can make DataLad publish large file content to one location
+ cannot be hosted for free by GitHub). You can make DataLad publish large file content to one location
and afterwards automatically push an update to GitHub, such that
users can install directly from GitHub/GitLab and seemingly also obtain large file
content from GitHub. GitHub can also resolve subdataset links to other GitHub
repositories, which lets you navigate through nested datasets in the web-interface.
+ ..
+   the images below cannot become figures because they cannot be used in LaTeX's minipage environment
+
.. image:: ../artwork/src/screenshot_midtermproject.png
:alt: The midterm project repository, published to GitHub
@@ -647,7 +671,8 @@ syllabus, this should be done via :term:`GitHub`.
.. image:: ../artwork/src/screenshot_submodule.png
:alt: The input dataset is linked
-.. index:: ! datalad command; create-sibling-github
+.. index::
+ pair: create-sibling-github; DataLad command
.. _publishtogithub:
Publishing the dataset to GitHub
@@ -660,7 +685,8 @@ For this, you need to
- configure this GitHub repository to be a :term:`sibling` of the ``midterm_project`` dataset,
- and *publish* your dataset to GitHub.
-.. index:: ! datalad command; create-sibling-gitlab
+.. index::
+ pair: create-sibling-gitlab; DataLad command
Luckily, DataLad can make this very easy with the
:dlcmd:`create-sibling-github`
@@ -672,10 +698,11 @@ The command takes a repository name and GitHub authentication credentials
(either in the command line call with options ``github-login ``, with an *oauth* `token `_ stored in the Git
configuration, or interactively).
+.. index::
+ pair: GitHub token; credential
.. importantnote:: Generate a GitHub token
- GitHub `deprecated user-password authentication `_ supports authentication via personal access token only.
-
+   GitHub `deprecated user-password authentication `_ and instead supports authentication via personal access tokens.
To ensure successful authentication, don't supply your password, but create a personal access token at `github.com/settings/tokens `_ [#f6]_ instead, and either
* supply the token with the argument ``--github-login `` from the command line,
@@ -693,12 +720,14 @@ configure this repository as a sibling of the dataset:
$ python3 /home/me/makepushtarget.py '/home/me/dl-101/DataLad-101/midterm_project' 'github' '/home/me/pushes/midterm_project' False True
+
+.. index:: credential; entry
+ pair: typed credentials are not displayed; on Windows
.. windows-wit:: Your shell will not display credentials
- Don't be confused if you are prompted for your GitHub credentials, but can't seem to type -- the terminal protects your private information by not displaying what you type.
- Simply type in what is requested, and press enter.
+ .. include:: topic/credential-nodisplay.rst
-.. code-block:: bash
+.. code-block:: console
$ datalad create-sibling-github -d . midtermproject
.: github(-) [https://github.com/adswa/midtermproject.git (git)]
@@ -707,13 +736,15 @@ configure this repository as a sibling of the dataset:
Verify that this worked by listing the siblings of the dataset:
-.. code-block:: bash
+.. code-block:: console
$ datalad siblings
[WARNING] Failed to determine if github carries annex.
.: here(+) [git]
.: github(-) [https://github.com/adswa/midtermproject.git (git)]
+.. index::
+ pair: sibling (GitHub); DataLad concept
.. gitusernote:: Create-sibling-github internals
Creating a sibling on GitHub will create a new empty repository under the
@@ -721,7 +752,9 @@ Verify that this worked by listing the siblings of the dataset:
:dlcmd:`push` to this sibling, your dataset's history
will be pushed there.
- .. index:: ! datalad command; push
+.. index::
+ pair: push; DataLad concept
+ pair: push (dataset); with DataLad
On GitHub, you will see a new, empty repository with the name
``midtermproject``. However, the repository does not yet contain
@@ -736,7 +769,7 @@ command.
proportion of the previous handbook content as a prerequisite. To avoid being
too overwhelmingly detailed, the upcoming sections will approach
:dlcmd:`push` from a "learning-by-doing" perspective:
- You will see a first :dlcmd:`push` to GitHub below, and the :ref:`Findoutmore on the published dataset `
+ First, you will see a :dlcmd:`push` to GitHub, and the :ref:`Findoutmore on the published dataset `
at the end of this section will already give a practical glimpse into the
difference between annexed contents and contents stored in Git when pushed
to GitHub. The chapter :ref:`chapter_thirdparty` will extend on this,
@@ -760,7 +793,7 @@ command.
$ datalad push --to github
Thus, you have now published your dataset's history to a public place for others
-to see and clone. Below we will explore how this may look and feel for others.
+to see and clone. Now we will explore how this may look and feel for others.
There is one important detail first, though: By default, your tags will not be published.
Thus, the tag ``ready4analysis`` is not pushed to GitHub, and currently this
@@ -769,10 +802,15 @@ The reason for this is that tags are viral -- they can be removed locally, and o
published tags can cause confusion or unwanted changes. In order to publish a tag,
an additional :gitcmd:`push` with the ``--tags`` option is required:
-.. code-block:: bash
+.. index::
+ pair: push; DataLad concept
+ pair: push (tag); with Git
+.. code-block:: console
$ git push github --tags
+.. index::
+ pair: push (tag); with DataLad
.. gitusernote:: Pushing tags
Note that this is a :gitcmd:`push`, not :dlcmd:`push`.
@@ -783,13 +821,13 @@ an additional :gitcmd:`push` with the ``--tags`` option is required:
configuration would push all tags that start with a ``v`` upon a
:dlcmd:`push --to github`:
- .. code-block:: bash
+ .. code-block:: console
$ git config --local remote.github.push 'refs/tags/v*'
This configuration would result in the following entry in ``.git/config``:
- .. code-block:: bash
+ .. code-block:: ini
[remote "github"]
url = git@github.com:adswa/midtermproject.git
@@ -801,6 +839,9 @@ Yay! Consider your midterm project submitted! Others can now install your
dataset and check out your data science project -- and even better: they can
reproduce your data science project easily from scratch (take a look into the :ref:`Findoutmore ` to see how)!
+.. index::
+ pair: work on published YODA dataset; with DataLad
+ pair: rerun command; with DataLad
.. find-out-more:: On the looks and feels of this published dataset
:name: fom-midtermclone
:float:
@@ -810,7 +851,7 @@ reproduce your data science project easily from scratch (take a look into the :r
Therefore, you decide to install this dataset into a new location on your
computer, just to get a feel for it.
- Replace the ``url`` in the :dlcmd:`clone` command below with the path
+ Replace the ``url`` in the :dlcmd:`clone` command with the path
to your own ``midtermproject`` GitHub repository, or clone the "public"
``midterm_project`` repository that is available via the Handbook's GitHub
organization at `github.com/datalad-handbook/midterm_project `_:
@@ -833,7 +874,7 @@ reproduce your data science project easily from scratch (take a look into the :r
$ cd midtermproject
$ datalad get input/iris.csv
- Nice, this worked well. The output files, however, can not be easily
+ Nice, this worked well. The output files, however, cannot be easily
retrieved:
.. runrecord:: _examples/DL-101-130-121
@@ -858,7 +899,7 @@ reproduce your data science project easily from scratch (take a look into the :r
:dlcmd:`rerun` command. If the tag was published we can simply
rerun any :dlcmd:`run` command since this tag:
- .. code-block:: bash
+ .. code-block:: console
$ datalad rerun --since ready4analysis
@@ -876,17 +917,20 @@ reproduce your data science project easily from scratch (take a look into the :r
With this, you realize again how letting DataLad take care of linking input,
output, and code can make your life and others' lives so much easier.
Applying the YODA principles to your data analysis was very beneficial indeed.
- Proud of your midterm project you can not wait to use those principles the
+   Proud of your midterm project, you cannot wait to use those principles the
next time again.
.. image:: ../artwork/src/reproduced.svg
:width: 50%
+ :align: center
+.. index::
+ pair: push; DataLad concept
.. gitusernote:: Push internals
The :dlcmd:`push` uses ``git push``, and ``git annex copy`` under
the hood. Publication targets need to either be configured remote Git repositories,
- or git-annex special remotes (if they support data upload).
+ or git-annex :term:`special remote`\s (if they support data upload).
.. only:: adminmode
@@ -906,7 +950,7 @@ reproduce your data science project easily from scratch (take a look into the :r
creation of the dataset with ``-c yoda``, but also after creation
with the :dlcmd:`run-procedure` command:
- .. code-block:: bash
+ .. code-block:: console
$ cd midterm_project
$ datalad run-procedure cfg_yoda
@@ -922,16 +966,16 @@ reproduce your data science project easily from scratch (take a look into the :r
`virtual environment `_ and
install the required Python packages inside of it:
- .. code-block:: bash
+ .. code-block:: console
- # create and enter a new virtual environment (optional)
+ $ # create and enter a new virtual environment (optional)
$ virtualenv --python=python3 ~/env/handbook
$ . ~/env/handbook/bin/activate
- .. code-block:: bash
+ .. code-block:: console
- # install the Python packages from PyPi via pip
- pip install seaborn pandas sklearn
+      $ # install the Python packages from PyPI via pip
+      $ pip install seaborn pandas scikit-learn
.. [#f4] All ``README.md`` files the YODA procedure created are
version controlled by Git, not git-annex, thanks to the
@@ -944,8 +988,4 @@ reproduce your data science project easily from scratch (take a look into the :r
.. [#f5] Alternatively, if you were to use DataLad's Python API, you could import and expose it as ``dl.`` and ``dl.get()`` the relevant files. This, however, would not record them as provenance in the dataset's history.
.. [#f6] Instead of using GitHub's WebUI you could also obtain a token using the command line GitHub interface (https://github.com/sociomantic-tsunami/git-hub) by running ``git hub setup`` (if no 2FA is used).
- If you decide to use the command line interface, here is help on how to use it:
- Clone the `GitHub repository `_ to your local computer.
- Decide whether you want to build a Debian package to install, or install the single-file Python script distributed in the repository.
- Make sure that all `requirements `_ for your preferred version are installed , and run either ``make deb`` followed by ``sudo dpkg -i deb/git-hub*all.deb``, or ``make install``.
diff --git a/docs/basics/101-132-advancednesting.rst b/docs/basics/101-132-advancednesting.rst
index 080151a81..4ca9e9898 100644
--- a/docs/basics/101-132-advancednesting.rst
+++ b/docs/basics/101-132-advancednesting.rst
@@ -1,10 +1,10 @@
+.. index::
+ pair: dataset nesting; DataLad concept
.. _nesting2:
-More on Dataset nesting
+More on dataset nesting
^^^^^^^^^^^^^^^^^^^^^^^
-.. index:: ! nesting
-
You may have noticed how working in the subdataset felt as if you were
working in an independent dataset -- there was no information or influence at
all from the top-level ``DataLad-101`` superdataset, and you build up a
@@ -36,7 +36,7 @@ evolved. Let's query the superdataset what it thinks about this.
:language: console
:workdir: dl-101/DataLad-101/midterm_project
- # move into the superdataset
+ $ # move into the superdataset
$ cd ../
$ datalad status
@@ -45,23 +45,86 @@ From the superdataset's perspective, the subdataset appears as being
indeed the complete subdataset as a single entity.
What this shows you is that the modifications of the subdataset you performed are not
-automatically recorded to the superdataset. This makes sense -- after all it
-should be up to you to decide whether you want record something or not --,
-but it is worth repeating: If you modify a subdataset, you will need to save
+automatically recorded to the superdataset. This makes sense; after all, it
+should be up to you to decide whether you want to record something or not.
+But it is worth repeating: If you modify a subdataset, you will need to save
this *in the superdataset* in order to have a clean superdataset status.
-This point in time in DataLad-101 is a convenient moment to dive a bit deeper
+Let's save the modification of the subdataset into the history of the
+superdataset. For this, to avoid confusion, you can specify explicitly to
+which dataset you want to save a modification. ``-d .`` specifies the current
+dataset, i.e., ``DataLad-101``, as the dataset to save to:
+
+.. runrecord:: _examples/DL-101-132-103
+ :language: console
+ :workdir: dl-101/DataLad-101/
+
+ $ datalad save -d . -m "finished my midterm project" midterm_project
+
+.. index::
+ pair: save modification in nested dataset; with DataLad
+.. find-out-more:: More on how 'datalad save' can operate on nested datasets
+
+ In a superdataset with subdatasets, :dlcmd:`save` by default
+ tries to figure out on its own which dataset's history of all available
+   datasets a :dlcmd:`save` should be written to. However, it can reduce
+   confusion to be very explicit in the command call and tell DataLad
+   where to save which kind of modification.
+
+ If you want to save the current state of the subdataset into the superdataset
+ (as necessary here), start a ``save`` from the superdataset and have the
+ ``-d/--dataset`` option point to its root:
+
+ .. code-block:: console
+
+ $ # in the root of the superds
+ $ datalad save -d . -m "update subdataset"
+
+ If you are in the superdataset, and you want to save an unsaved modification
+ in a subdataset to the *subdatasets* history, let ``-d/--dataset`` point to
+ the subdataset:
+
+ .. code-block:: console
+
+ $ # in the superds
+ $ datalad save -d path/to/subds -m "modified XY"
+
+ The recursive option allows you to save any content underneath the specified
+ directory, and recurse into any potential subdatasets:
+
+ .. code-block:: console
+
+ $ datalad save . --recursive
+
+Let's check which subproject commit is now recorded in the superdataset:
+
+.. runrecord:: _examples/DL-101-132-104
+ :language: console
+ :workdir: dl-101/DataLad-101/
+ :emphasize-lines: 14
+
+ $ git log -p -n 1
+
+As you can see in the log entry, the subproject commit changed from the
+first commit hash in the subdataset history to the most recent one. With this
+change, therefore, your superdataset tracks the most recent version of
+the ``midterm_project`` dataset, and your dataset's status is clean again.
+
+
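+A final :dlcmd:`status` should confirm this (a sketch of the expected
+report):
+
+.. code-block:: console
+
+   $ datalad status
+   nothing to save, working tree clean
+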
+This point in DataLad-101 is a convenient moment to dive a bit deeper
into the functions of the :dlcmd:`status` command. If you are
interested in this, check out the :ref:`dedicated Findoutmore `.
-.. find-out-more:: More on datalad status
+.. index::
+ pair: status; DataLad command
+ pair: check dataset for modification; with DataLad
+.. find-out-more:: More on 'datalad status'
:name: fom-status
:float:
First of all, let's start with a quick overview of the different content *types*
and content *states* various :dlcmd:`status` commands in the course
- of DataLad-101 have shown up to this point:
-
+ of DataLad-101 have shown up to this point.
You have seen the following *content types*:
- ``file``, e.g., ``notes.txt``: any file (or symlink that is a placeholder to an annexed file)
@@ -72,7 +135,7 @@ interested in this, checkout the :ref:`dedicated Findoutmore `.
that is properly registered in the superdataset
And you have seen the following *content states*: ``modified`` and ``untracked``.
- The section :ref:`filesystem` will show you many instances of ``deleted`` content
+ The section :ref:`file system` will show you many instances of ``deleted`` content
state as well.
But beyond understanding the report of :dlcmd:`status`, there is also
@@ -85,13 +148,12 @@ interested in this, checkout the :ref:`dedicated Findoutmore `.
When performed without any arguments, :dlcmd:`status` will report
the state of the current dataset. However, you can specify a path to any
sub- or superdataset with the ``--dataset`` option.
-
In order to demonstrate this a bit better, we will make sure that not only the
state of the subdataset *within* the superdataset is modified, but also that the
subdataset contains a modification. For this, let's add an empty text file into
the ``midterm_project`` subdataset:
- .. runrecord:: _examples/DL-101-132-103
+ .. runrecord:: _examples/DL-101-132-105
:language: console
:workdir: dl-101/DataLad-101
@@ -101,7 +163,7 @@ interested in this, checkout the :ref:`dedicated Findoutmore `.
*within* the subdataset, simply provide a path (relative to your current location)
to the command:
- .. runrecord:: _examples/DL-101-132-104
+ .. runrecord:: _examples/DL-101-132-106
:language: console
:workdir: dl-101/DataLad-101
@@ -111,7 +173,7 @@ interested in this, checkout the :ref:`dedicated Findoutmore `.
and provide a path to the subdataset *with a trailing path separator* like
this:
- .. runrecord:: _examples/DL-101-132-105
+ .. runrecord:: _examples/DL-101-132-107
:language: console
:workdir: dl-101/DataLad-101
@@ -120,13 +182,13 @@ interested in this, checkout the :ref:`dedicated Findoutmore `.
Note that both of these commands return only the ``untracked`` file and
not the ``modified`` subdataset because we're explicitly querying only the
subdataset for its status.
- If you however, as done outside of this hidden section, you want to know about
+   If, however, as done outside of this Find-out-more, you want to know about
the subdataset record in the superdataset without causing a status query for
the state *within* the subdataset itself, you can also provide an explicit
path to the dataset (without a trailing path separator). This can be used
to specify a specific subdataset in the case of a dataset with many subdatasets:
- .. runrecord:: _examples/DL-101-132-106
+ .. runrecord:: _examples/DL-101-132-108
:language: console
:workdir: dl-101/DataLad-101
@@ -137,7 +199,7 @@ interested in this, checkout the :ref:`dedicated Findoutmore `.
the state of the subdataset within the superdataset, you can combine the
two paths:
- .. runrecord:: _examples/DL-101-132-107
+ .. runrecord:: _examples/DL-101-132-109
:language: console
:workdir: dl-101/DataLad-101
@@ -146,7 +208,7 @@ interested in this, checkout the :ref:`dedicated Findoutmore `.
Finally, if these subtle differences in the paths are not easy to memorize,
the ``-r/--recursive`` option will also report you both status aspects:
- .. runrecord:: _examples/DL-101-132-108
+ .. runrecord:: _examples/DL-101-132-110
:language: console
:workdir: dl-101/DataLad-101
@@ -155,88 +217,29 @@ interested in this, checkout the :ref:`dedicated Findoutmore `.
Importantly, the regular output from a :dlcmd:`status` command in the command line is "condensed" to the most important information by a tailored result renderer.
You can, however, also get ``status``' unfiltered full output by switching the ``-f``/``--output-format`` from ``tailored`` (the default) to ``json`` or, for the same information as ``json`` but better readability, ``json_pp``:
- .. runrecord:: _examples/DL-101-132-108a
+ .. runrecord:: _examples/DL-101-132-111
:language: console
:workdir: dl-101/DataLad-101
$ datalad -f json_pp status -d . midterm_project
This still was not all of the available functionality of the
- :dlcmd:`status` command. You could for example adjust whether and
+ :dlcmd:`status` command. You could, for example, adjust whether and
how untracked dataset content should be reported with the ``--untracked``
option, or get additional information from annexed content with the ``--annex``
option (especially powerful when combined with ``-f json_pp``). To get a complete overview on what you could do, check out the technical
documentation of :dlcmd:`status` `here `_.
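+
+   For example, an annex-aware status query could look like this (a
+   sketch; the report depends on the dataset's annexed content):
+
+   .. code-block:: console
+
+      $ datalad status --annex
+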
- Before we leave this hidden section, lets undo the modification of the subdataset
+   Before we leave this Find-out-more, let's undo the modification of the subdataset
by removing the untracked file:
- .. runrecord:: _examples/DL-101-132-109
+ .. runrecord:: _examples/DL-101-132-112
:language: console
:workdir: dl-101/DataLad-101
$ rm midterm_project/an_empty_file
$ datalad status --recursive
-Let's save the modification of the subdataset into the history of the
-superdataset. For this, to avoid confusion, you can specify explicitly to
-which dataset you want to save a modification. ``-d .`` specifies the current
-dataset, i.e., ``DataLad-101``, as the dataset to save to:
-
-.. runrecord:: _examples/DL-101-132-110
- :language: console
- :workdir: dl-101/DataLad-101/
-
- $ datalad save -d . -m "finished my midterm project" midterm_project
-
-.. find-out-more:: More on how save can operate on nested datasets
-
- In a superdataset with subdatasets, :dlcmd:`save` by default
- tries to figure out on its own which dataset's history of all available
- datasets a :dlcmd:`save` should be written to. However, it can reduce
- confusion or allow specific operations to be very explicit in the command
- call and tell DataLad where to save what kind of modifications to.
-
- If you want to save the current state of the subdataset into the superdataset
- (as necessary here), start a ``save`` from the superdataset and have the
- ``-d/--dataset`` option point to its root:
-
- .. code-block:: bash
-
- # in the root of the superds
- $ datalad save -d . -m "update subdataset"
-
- If you are in the superdataset, and you want to save an unsaved modification
- in a subdataset to the *subdatasets* history, let ``-d/--dataset`` point to
- the subdataset:
-
- .. code-block:: bash
-
- # in the superds
- $ datalad save -d path/to/subds -m "modified XY"
-
- The recursive option allows you to save any content underneath the specified
- directory, and recurse into any potential subdatasets:
-
- .. code-block:: bash
-
- $ datalad save . --recursive
-
-Let's check which subproject commit is now recorded in the superdataset:
-
-.. runrecord:: _examples/DL-101-132-112
- :language: console
- :workdir: dl-101/DataLad-101/
- :emphasize-lines: 14
-
- $ git log -p -n 1
-
-As you can see in the log entry, the subproject commit changed from the
-first commit hash in the subdataset history to the most recent one. With this
-change, therefore, your superdataset tracks the most recent version of
-the ``midterm_project`` dataset, and your dataset's status is clean again.
-
-
.. only:: adminmode
Add a tag at the section end.
diff --git a/docs/basics/101-133-containersrun.rst b/docs/basics/101-133-containersrun.rst
index 8bfc9b60f..34dadee28 100644
--- a/docs/basics/101-133-containersrun.rst
+++ b/docs/basics/101-133-containersrun.rst
@@ -38,14 +38,29 @@ that can link computational environments to datasets, the
extension.
This section will give a quick overview on what containers are and
-demonstrate how ``datalad-containers`` helps to capture full provenance of an
+demonstrate how ``datalad-container`` helps to capture full provenance of an
analysis by linking containers to datasets and analyses.
+.. importantnote:: Install the datalad-container extension
+
+ This section uses the :term:`DataLad extension` ``datalad-container``.
+ As other extensions, it is a stand-alone Python package, and can be installed using :term:`pip`:
+
+   .. code-block:: console
+
+ $ pip install datalad-container
+
+ As with DataLad and other Python packages, you might want to do the installation in a :term:`virtual environment`.
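+
+   Afterwards, a quick way to verify the installation is to check that the
+   extension's commands are available (a minimal sketch):
+
+   .. code-block:: console
+
+      $ datalad containers-add --help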
+
+
+.. index::
+ pair: recipe; software container concept
+ pair: image; software container concept
+ pair: container; software container concept
+
Containers
^^^^^^^^^^
-.. index:: ! software container, ! container
-
To put it simply, computational containers are cut-down virtual machines that
allow you to package all software libraries and their dependencies (all in the
precise version your analysis requires) into a bundle you can share with
@@ -53,13 +68,13 @@ others. On your own and other's machines, the container constitutes a secluded
software environment that
- contains the exact software environment that you specified, ready to run
- analyses in
+ analyses
- does not affect any software outside of the container
-Unlike virtual machines, software containers do not have their own operating
-system. Instead, they use basic services of the underlying operating system
-of the computer they run on (in a read-only fashion). This makes them
-lightweight and portable. By sharing software environments with containers,
+Unlike virtual machines, software containers do not run a full operating
+system on virtualized hardware. Instead, they use basic services of the host operating system
+(in a read-only fashion). This makes them
+lightweight and still portable. By sharing software environments with containers,
others (and also yourself) have easy access to the correct software
without the need to modify the software environment of the machine the
container runs on. Thus, containers are ideal to encapsulate the software
@@ -75,44 +90,60 @@ While being a powerful tool, it is only rarely used on high performance computin
.io/docs>`_.
Both of these tools share core terminology:
-**Recipe**
- A text file template that lists all required components of the computational environment.
+:term:`container recipe`
+ A text file that lists all required components of the computational environment.
It is made by a human user.
-**Image**
- This is *built* from the recipe file. It is a static filesystem inside a file,
+:term:`container image`
+ This is *built* from the recipe file. It is a static file system inside a file,
populated with the software specified in the recipe, and some initial configuration.
-**Container**
- A running instance of an Image that you can actually use for your computations.
+:term:`container`
+ A running instance of an image that you can actually use for your computations.
If you want to create and run your own software container, you start by writing
- a recipe file and build an Image from it. Alternatively, you can can also *pull*
- an Image built from a publicly shared recipe from the *Hub* of the tool you are using.
+   a recipe file and build an image from it. Alternatively, you can also *pull*
+ an image built from a publicly shared recipe from the *Hub* of the tool you are using.
-**Hub**
+hub
A storage resource to share and consume images. Examples are
- `Singularity-Hub `_,
- `Docker-Hub `_, and `Amazon ECR `_ which hosts Docker Images.
+ :term:`Singularity-Hub`, :term:`Docker-Hub`, and `Amazon ECR `_ which hosts Docker images.
-Note that as of now, the ``datalad-containers`` extension supports
+Note that as of now, the ``datalad-container`` extension supports
Singularity and Docker images.
Singularity furthermore is compatible with Docker -- you can use
-Docker Images as a basis for Singularity Images, or run Docker Images with
+Docker images as a basis for Singularity images, or run Docker images with
Singularity (even without having Docker installed).
+See the :windows-wit:`on Docker ` for installation options.
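+
+For example, pulling a Docker image with Singularity could look like this
+(a sketch with a hypothetical image name):
+
+.. code-block:: console
+
+   $ singularity pull docker://python:3.11
+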
.. importantnote:: Additional requirement: Singularity
- In order to use Singularity containers you have to
+ To use Singularity containers you have to
`install `_ the software singularity.
+.. index::
+ pair: installation; Docker
+ pair: install Docker; on Windows
+.. find-out-more:: Docker installation Windows
+ :name: ww-docker
+
+   The software Singularity is not available for Windows.
+   Windows users therefore need to install :term:`Docker`.
+   The currently recommended way to do so is to install `Docker Desktop `_ and use its "WSL2" backend (a choice one can set during the installation).
+ In the case of an "outdated WSL kernel version" issue, run ``wsl --update`` in a regular Windows Command Prompt (CMD).
+ After the installation, run Docker Desktop, and wait several minutes for it to start the Docker engine in the background.
+ To verify that everything works as it should, run ``docker ps`` in a Windows Command Prompt (CMD).
+ If it reports an error that asks "Is the docker daemon running?" give it a few more minutes to let Docker Desktop start it.
+ If it can't find the docker command, something went wrong during installation.
+
+.. index::
+ pair: containers-add; DataLad command
+ pair: containers-run; DataLad command
+
Using ``datalad containers``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. index:: ! datalad command; containers-add
-.. index:: ! datalad command; containers-run
-
One core feature of the ``datalad-container`` extension is that it registers
-computational containers to a dataset. This is done with the
+computational containers with a dataset. This is done with the
:dlcmd:`containers-add` command.
Once a container is registered, arbitrary commands can be executed inside of
it, i.e., in the precise software environment the container encapsulates. All it
@@ -122,41 +153,46 @@ section :ref:`run` with the :dlcmd:`containers-run` command.
Let's see this in action for the ``midterm_analysis`` dataset by rerunning
the analysis you did for the midterm project within a Singularity container.
We start by registering a container to the dataset.
-For this, we will pull an Image from Singularity hub. This Image was made
+For this, we will pull an image from Singularity Hub. This image was made
for the handbook, and it contains the relevant Python setup for
the analysis. Its recipe lives in the handbook's
`resources repository `_.
-If you're curious how to create a Singularity Image, the hidden
-section below has some pointers:
+If you are curious how to create a Singularity image, the :find-out-more:`on this topic ` has some pointers:
-.. find-out-more:: How to make a Singularity Image
+.. index::
+ pair: build container image; with Singularity
+.. find-out-more:: How to make a Singularity image
+ :name: fom-container-creation
- Singularity containers are build from Image files, often
+ Singularity containers are built from image files, often
called "recipes", that hold a "definition" of the software container and its
contents and components. The
`singularity documentation `_
- has its own tutorial on how to build such Images from scratch.
- An alternative to writing the Image file by hand is to use
+ has its own tutorial on how to build such images from scratch.
+ An alternative to writing the image file by hand is to use
`Neurodocker `_. This
command-line program can help you generate custom Singularity recipes (and
- also ``Dockerfiles``, from which Docker Images are build). A wonderful tutorial
+ also ``Dockerfiles``, from which Docker images are built). A wonderful tutorial
on how to use Neurodocker is
`this introduction `_
by Michael Notter.
Once a recipe exists, the command
- .. code-block:: bash
+ .. code-block:: console
- sudo singularity build <container> <recipe>
+ $ sudo singularity build <container> <recipe>
will build a container (called ``<container>``) from the recipe. Note that this
command requires ``root`` privileges ("``sudo``"). You can build the container
on any machine, though, not necessarily the one that is later supposed to
actually run the analysis, e.g., your own laptop versus a compute cluster.
+.. index::
+ pair: add container image to dataset; with DataLad
+
The :dlcmd:`containers-add` command takes an arbitrary
-name to give to the container, and a path or url to a container Image:
+name to give to the container, and a path or URL to a container image:
.. runrecord:: _examples/DL-101-133-101
:language: console
@@ -164,84 +200,45 @@ name to give to the container, and a path or url to a container Image:
:cast: 10_yoda
:notes: Computational reproducibility: add a software container
- # we are in the midterm_project subdataset
+ $ # we are in the midterm_project subdataset
$ datalad containers-add midterm-software --url shub://adswa/resources:2
-.. find-out-more:: How do I add an Image from Dockerhub, Amazon ECR, or a local container?
+.. index::
+ pair: hub; Docker
+.. find-out-more:: How do I add an image from Docker-Hub, Amazon ECR, or a local container?
- Should the Image you want to use lie on Dockerhub, specify the ``--url``
+ Should the image you want to use sit on Docker-Hub, specify the ``--url``
option prefixed with ``docker://`` or ``dhub://`` instead of ``shub://``:
- .. code-block:: bash
+ .. code-block:: console
- datalad containers-add midterm-software --url docker://adswa/resources:2
+ $ datalad containers-add midterm-software --url docker://adswa/resources:2
- If your Image exists on Amazon ECR, use a ``dhub://`` prefix followed by the AWS ECR URL as in
+ If your image lives on Amazon ECR, use a ``dhub://`` prefix followed by the AWS ECR URL as in
- .. code-block:: bash
+ .. code-block:: console
- datalad containers-add --url dhub://12345678.dkr.ecr.us-west-2.amazonaws.com/maze-code/data-import:latest data-import
+ $ datalad containers-add --url dhub://12345678.dkr.ecr.us-west-2.amazonaws.com/maze-code/data-import:latest data-import
If you want to add a container that exists locally, specify the path to it
like this:
- .. code-block:: bash
+ .. code-block:: console
- datalad containers-add midterm-software --url path/to/container
+ $ datalad containers-add midterm-software --url path/to/container
This command downloaded the container from Singularity Hub, added it to
the ``midterm_project`` dataset, and recorded basic information on the
container under its name "midterm-software" in the dataset's configuration at
``.datalad/config``. You can find out more about them in a dedicated :ref:`find-out-more on these additional configurations `.
+.. index::
+ pair: DataLad concept; container image registration
.. find-out-more:: What changes in .datalad/config when one adds a container?
:name: fom-containerconfig
:float:
- .. runrecord:: _examples/DL-101-133-102
- :language: console
- :workdir: dl-101/DataLad-101/midterm_project
-
- $ cat .datalad/config
-
- This recorded the Image's origin on Singularity-Hub, the location of the
- Image in the dataset under ``.datalad/environments//image``, and it
- specifies the way in which the container should be used: The line
-
- .. code-block:: bash
-
- cmdexec = singularity exec {img} {cmd}
-
- can be read as: "If this container is used, take the ``cmd`` (what you wrap in a
- :dlcmd:`containers-run` command) and plug it into a
- :shcmd:`singularity exec` command. The mode of calling Singularity,
- namely ``exec``, means that the command will be executed inside of the container.
-
- You can configure this call format by modifying it in the config file, or calling :dlcmd:`containers-add` with the option ``--call-fmt ``.
- This can be useful to, for example, automatically bind-mount the current working directory in the container.
- In the alternative call format, the placeholders ``{img}``, ``{cmd}``, and ``{img_dspath}`` (a relative path to the dataset containing the image) are available.
- In all other cases with variables that use curly brackets, you need to escape them with another curly bracket.
- Here is an example call format that bind-mounts the current working directory (and thus the dataset) automatically:
-
- .. code-block:: bash
-
- datalad containers-add --call-fmt 'singularity exec -B {{pwd}} --cleanenv {img} {cmd}'
-
- Note that the Image is saved under ``.datalad/environments`` and the
- configuration is done in ``.datalad/config`` -- as these files are version
- controlled and shared with together with a dataset, your software
- container and the information where it can be re-obtained from are linked
- to your dataset.
-
- This is how the ``containers-add`` command is recorded in your history:
-
- .. runrecord:: _examples/DL-101-133-103
- :language: console
- :workdir: dl-101/DataLad-101/midterm_project
- :cast: 10_yoda
- :notes: The software container got added to your datasets history
-
- $ git log -n 1 -p
+ .. include:: topic/container-imgcfg.rst
Such configurations can, among other things, be important to ensure correct container invocation on specific systems or across systems.
One example is *bind-mounting* directories into containers, i.e., making a specific directory and its contents available inside a container.
@@ -249,6 +246,8 @@ Different containerization software (versions) or configurations of those determ
Thus, depending on the system and the location of the dataset on this system, a shared dataset may be automatically bind-mounted or not.
To ensure that the dataset is correctly bind-mounted on all systems, let's add a call-format specification with a bind-mount to the current working directory following the information in the :ref:`find-out-more on additional container configurations `.
+.. index::
+ single: configuration item; datalad.containers.<name>.cmdexec
.. runrecord:: _examples/DL-101-133-104
:language: console
:workdir: dl-101/DataLad-101/midterm_project
@@ -257,14 +256,18 @@ To ensure that the dataset is correctly bind-mounted on all systems, let's add a
$ git config -f .datalad/config datalad.containers.midterm-software.cmdexec 'singularity exec -B {{pwd}} {img} {cmd}'
$ datalad save -m "Modify the container call format to bind-mount the working directory"
+.. index::
+ pair: run command with provenance capture; with DataLad
+ pair: run command; with DataLad containers-run
+
Now that we have a complete computational environment linked to the ``midterm_project``
-dataset, we can execute commands in this environment. Let us for example try to repeat
+dataset, we can execute commands in this environment. Let us, for example, try to repeat
the :dlcmd:`run` command from the section :ref:`yoda_project` as a
:dlcmd:`containers-run` command.
The previous ``run`` command looked like this:
-.. code-block:: bash
+.. code-block:: console
$ datalad run -m "analyze iris data with classification analysis" \
--input "input/iris.csv" \
@@ -294,13 +297,14 @@ But if your dataset contains more than one container you will *need* to specify
the name of the container you want to use in your command.
The complete command's structure looks like this:
-.. code-block:: bash
+.. code-block:: console
$ datalad containers-run --name <containername> [-m ...] [--input ...] [--output ...] <COMMAND>
-.. index:: ! datalad command; containers-remove
-.. index:: ! datalad command; containers-list
-
+.. index::
+ pair: containers-remove; DataLad command
+ pair: containers-list; DataLad command
+ pair: list known containers; with DataLad
.. find-out-more:: How can I list available containers or remove them?
The command :dlcmd:`containers-list` will list all containers in
@@ -315,7 +319,7 @@ The complete command's structure looks like this:
The command :dlcmd:`containers-remove` will remove a container
from the dataset, if there exists a container with the name given to the
- command. Note that this will remove not only the Image from the dataset,
+ command. Note that this will remove not only the image from the dataset,
but also the configuration for it in ``.datalad/config``.
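+
+   As a sketch, using the container name ``midterm-software`` registered earlier in this section:
+
+   .. code-block:: console
+
+      $ datalad containers-list
+      $ datalad containers-remove midterm-software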
@@ -331,7 +335,7 @@ Here is how the history entry looks like:
If you would :dlcmd:`rerun` this commit, it would be re-executed in the
software container registered to the dataset. If you would share the dataset
-with a friend and they would :dlcmd:`rerun` this commit, the Image would first
+with a friend and they would :dlcmd:`rerun` this commit, the image would first
be obtained from its registered URL, and thus your
friend can obtain the correct execution environment automatically.
@@ -357,7 +361,7 @@ the most recent state of the subdataset to the superdataset ``DataLad-101``.
$ datalad save -d . -m "add container and execute analysis within container" midterm_project
-Software containers, the ``datalad-containers`` extension, and DataLad thus work well together
+Software containers, the ``datalad-container`` extension, and DataLad thus work well together
to make your analysis completely reproducible -- by not only linking code, data,
and outputs, but also the software environment of an analysis. And this does not
only benefit your future self, but also whomever you share your dataset with, as
diff --git a/docs/basics/101-134-summary.rst b/docs/basics/101-134-summary.rst
index 3b03604e1..d34403158 100644
--- a/docs/basics/101-134-summary.rst
+++ b/docs/basics/101-134-summary.rst
@@ -14,7 +14,7 @@ The last two sections have first of all extended your knowledge on dataset nesti
of the subdatasets version state. If you want to record this, you need to
:dlcmd:`save` it in the superdataset:
- .. code-block:: bash
+ .. code-block:: console
$ datalad save -m "a short summary of changes in subds"
@@ -29,8 +29,8 @@ for yourself why and how software containers can go hand-in-hand with DataLad:
`datalad containers `_
can make this possible.
-- The command :dlcmd:`containers-add` registers an Image from a path or
- url to your dataset.
+- The command :dlcmd:`containers-add` registers a :term:`container image` from a path or
+ URL to your dataset.
- If you use :dlcmd:`containers-run` instead of :dlcmd:`run`,
you can reproducibly execute a command of your choice *within* the software
@@ -39,6 +39,9 @@ for yourself why and how software containers can go hand-in-hand with DataLad:
- A :dlcmd:`rerun` of a commit produced with :dlcmd:`containers-run`
will re-execute the command in the same software environment.
+.. index::
+ pair: hub; Docker
+
Now what can I do with it?
^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -52,8 +55,8 @@ includes the relevant software environment. This does not only make your analyse
projects automatically reproducible, but automatically *computationally* reproducible -
you can make sure that your analyses run on any computer with Singularity,
regardless of the software environment on this computer. Even if you are unsure how you can wrap up an
-environment into a software container Image at this point, you could make use of
-hundreds of publicly available Images on `Singularity-Hub `_ and
+environment into a software :term:`container image` at this point, you could make use of
+hundreds of publicly available images on `Singularity-Hub `_ and
`Docker-Hub `_.
With this, you have also gotten a first glimpse into an extension of DataLad: A
diff --git a/docs/basics/101-135-help.rst b/docs/basics/101-135-help.rst
index ab1d969bd..0ea481fce 100644
--- a/docs/basics/101-135-help.rst
+++ b/docs/basics/101-135-help.rst
@@ -7,7 +7,7 @@ All DataLad errors or problems you encounter during ``DataLad-101`` are intentio
and serve illustrative purposes. But what if you run into any DataLad errors
outside of this course?
Fortunately, the syllabus has a whole section on that, and on
-one lazy, warm summer-afternoon you flip through it.
+one lazy, warm summer afternoon you flip through it.
.. figure:: ../artwork/src/reading.svg
:width: 50%
@@ -45,7 +45,9 @@ To find out which version you are using, run
$ datalad --version
-.. index:: ! datalad command; wtf
+.. index::
+ pair: wtf; DataLad command
+ pair: get system information; with DataLad
If you want a comprehensive overview of your full setup,
:dlcmd:`wtf` [#f1]_ is the command to turn to. Running this command will
@@ -92,6 +94,12 @@ list of command arguments with details on their possibilities and
requirements. A first thing to check would be whether your command call
specified all of the required arguments.
+An additional source of information is the `PsyInf knowledge base
+`_. It contains a curated
+collection of solutions and workarounds that have not yet made it into other
+documentation.
+
+
Asking questions (right)
^^^^^^^^^^^^^^^^^^^^^^^^
@@ -106,15 +114,15 @@ Include
- *context* -- what did you want to do and why?
-- the *problem* -- paste the error message (all of it), and provide the
- steps necessary to reproduce it.
+- the *problem* -- post the error message, and provide the
+ steps necessary to reproduce it. Do not shorten the error message, unless it contains sensitive information.
- *technical details* -- what version of DataLad are you using, what version
of git-annex, and which git-annex repository type, what is your operating
system and -- if applicable -- Python version? :dlcmd:`wtf` is your friend
to find all of this information.
-.. index:: Debugging
+.. index:: debugging
Debugging like a DataLad-developer
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -130,12 +138,13 @@ It is not always straightforward to see *why* a particular DataLad command has f
Given that operations with DataLad can be quite complicated, and could involve complexities such as different forms of authentication, different file systems, interactions with the environment, configurations, and other software, and *much* more, there are what may feel like an infinite number of sources for the problem at hand.
The resulting error message, however, may not display the underlying cause correctly because the error message of whichever process failed is not propagated into the final result report.
-In situations where there is no obvious reason for a command to fail, it can be helpful -- either for yourself or for further information to paste into :term:`GitHub` issues -- to start `debugging `_, or *logging at a higher granularity* than is the default.
+In situations where there is no obvious reason for a command to fail, it can be helpful -- either for yourself or for further information to include in :term:`GitHub` issues -- to start `debugging `_, or *logging at a higher granularity* than is the default.
This allows you to gain more insights into the actions DataLad and its underlying tools are taking, where *exactly* they fail, and to even play around with the program at the state of the failure.
:term:`Debugging` and :term:`logging` are not as complex as these terms may sound if you have never consciously debugged.
-Procedurally, it can be as easy as adding an additional flag to a command call, and cognitively, it can be as easy as engaging your visual system in a visual search task for the color red or the word "error", or reading more DataLad output than you're used to.
-The paragraphs below start with the general concepts, and collect concrete debugging strategies for different problems.
+
+Procedurally, it can be as easy as adding an additional flag to a command call, and cognitively, it can be as easy as engaging your visual system in a visual search task for the color red or the word "error", or reading more DataLad output than you are used to.
+We will start with the general concepts, and then collect concrete debugging strategies for different problems.
.. _logging:
@@ -153,6 +162,9 @@ Anything printed to your terminal preceded by ``[INFO]``, for example, is a log
When you are *consciously* logging, you simply set the log-level to the desired amount of information, or increase the amount of verbosity until the output gives you a hint of what went wrong.
Likewise, adjusting the log-level also works the other way around, and lets you *decrease* the amount of information you receive in your terminal.
+
+.. index::
+ pair: log level; DataLad concept
.. find-out-more:: Log levels
Log levels provide the means to adjust how much information you want, and are described in human readable terms, ordered by the severity of the failures or problems reported.
@@ -169,7 +181,6 @@ Likewise, adjusting the log-level also works the other way around, and lets you
Raising the log level (e.g., to ``error``, or ``40``) will decrease the amount of information and output you will receive, while lowering it (e.g., to ``debug`` or ``10``) will increase it.
-
Setting a log level can be done in the form of an :term:`environment variable`, a configuration, or with the ``-l``/``--log-level`` flag appended directly after the main :shcmd:`datalad` command.
To get extensive information on what :dlcmd:`status` does underneath the hood, your command could look like this (but its output is shortened):
@@ -180,19 +191,22 @@ To get extensive information on what :dlcmd:`status` does underneath the hood, y
$ datalad --log-level debug status
+.. index::
+ single: configuration item; datalad.log.level
+ pair: configure verbosity of command output; with DataLad
.. find-out-more:: ... and how does it look when using environment variables or configurations?
The log level can also be set (for different scopes) using the ``datalad.log.level`` configuration variable, or the corresponding environment variable ``DATALAD_LOG_LEVEL``.
To set the log level for a single command, for example, set it in front of the command:
- .. code-block:: bash
+ .. code-block:: console
$ DATALAD_LOG_LEVEL=debug datalad status
And to set the log level for the rest of the shell session, export it:
- .. code-block:: bash
+ .. code-block:: console
$ export DATALAD_LOG_LEVEL=debug
$ datalad status
@@ -202,7 +216,7 @@ To get extensive information on what :dlcmd:`status` does underneath the hood, y
The configuration variable can be used to set the log level on a user (global) or system-wide level with the :gitcmd:`config` command:
- .. code-block:: bash
+ .. code-block:: console
$ git config --global datalad.log.level debug
@@ -258,7 +272,7 @@ Output produced by Git
If you have not configured your Git identity, you will
see warnings like this when running any DataLad command:
-.. code-block:: bash
+.. code-block:: console
[WARNING] It is highly recommended to configure git first (set both user.name and user.email) before using DataLad.
@@ -271,7 +285,7 @@ One error you can run into when publishing dataset contents is that your
:dlcmd:`push` to a sibling is rejected.
One example is this:
-.. code-block:: bash
+.. code-block:: console
$ datalad push --to public
[ERROR ] refs/heads/main->public:refs/heads/main [rejected] (non-fast-forward) [publish(/home/me/dl-101/DataLad-101)]
@@ -286,7 +300,7 @@ know about. It can be fixed by updating from the sibling first with a
Here is a different push rejection:
-.. code-block:: bash
+.. code-block:: console
$ datalad push --to roommate
publish(ok): . (dataset) [refs/heads/git-annex->roommate:refs/heads/git-annex 023a541..59a6f8d]
@@ -301,7 +315,7 @@ In this particular case, this is because it was an attempt to push from ``DataLa
to the ``roommate`` sibling that was created in chapter :ref:`chapter_collaboration`.
This is a special case of pushing, because it -- in technical terms -- is a push
to a non-bare repository. Unlike :term:`bare Git repositories`, non-bare
-repositories can not be pushed to at all times. To fix this, you either want to
+repositories cannot be pushed to at all times. To fix this, you either want to
check out another branch
in the ``roommate`` sibling or push to a non-checked out branch in the ``roommate``
sibling. Alternatively, you can configure ``roommate`` to receive the push with
@@ -317,7 +331,7 @@ on ``receive.denyCurrentBranch`` for more.
One warning that you may encounter during an installation of a dataset is:
-.. code-block:: bash
+.. code-block:: console
[INFO ] Submodule HEAD got detached. Resetting branch main to point to 046713bb. Original location was 47e53498
@@ -334,7 +348,7 @@ Output produced by git-annex
Upon installation of a dataset, you may see:
-.. code-block:: bash
+.. code-block:: console
[INFO ] Remote origin not usable by git-annex; setting annex-ignore
[INFO ] This could be a problem with the git-annex installation on the
@@ -348,9 +362,9 @@ many reasons, but as long as there are other remotes you can access the
data from, you are fine.
A similar warning message may appear when adding a sibling that is a pure Git
-:term:`remote`, for example a repository on GitHub:
+:term:`remote`, such as a repository on GitHub:
-.. code-block:: bash
+.. code-block:: console
[INFO ] Failed to enable annex remote github, could be a pure git or not
accessible
@@ -359,7 +373,7 @@ A similar warning message may appear when adding a sibling that is a pure Git
by mistake due to absent connection etc
These messages indicate that the sibling ``github`` does not carry an annex.
-Thus, annexed file contents can not be pushed to this sibling. This happens
+Thus, annexed file contents cannot be pushed to this sibling. This happens
if the sibling indeed does not have an annex (which would be true, for example,
for siblings on :term:`GitHub`, :term:`GitLab`, :term:`Bitbucket`, ..., and
would not require any further action or worry), or
@@ -377,7 +391,7 @@ Other errors
Sometimes, registered subdatasets URLs have an :term:`SSH` instead of :term:`https` address, for example ``git@github.com:datalad-datasets/longnow-podcasts.git`` instead of ``https://github.com/datalad-datasets/longnow-podcasts.git``.
If one does not have an SSH key configured for the required service (e.g., GitHub, or a server), installing or getting the subdataset and its contents fails, with messages starting similar to this:
-.. code-block:: bash
+.. code-block:: console
[INFO ] Cloning https://github.com/psychoinformatics-de/paper-remodnav.git/remodnav [2 other candidates] into '/home/.../remodnav'
Permission denied (publickey).
diff --git a/docs/basics/101-135-intro.rst b/docs/basics/101-135-intro.rst
index 362cd2f90..20ab5edc9 100644
--- a/docs/basics/101-135-intro.rst
+++ b/docs/basics/101-135-intro.rst
@@ -10,7 +10,7 @@ appreciate the pre-crafted examples and tasks the handbook provides.
different errors and know many caveats and principles already, but I certainly
will mess something up at one point. How can I get help, or use the history of
the dataset to undo what I screwed up? Also, I'm not sure whether I know what I
-can and can not do with the files inside of my dataset... What if I would
+can and cannot do with the files inside of my dataset... What if I would
like to remove one, for example?”
Therefore, this upcoming chapter is a series of tutorials about common
diff --git a/docs/basics/101-136-cheatsheet.rst b/docs/basics/101-136-cheatsheet.rst
index 2af2cb75c..28b2ecaed 100644
--- a/docs/basics/101-136-cheatsheet.rst
+++ b/docs/basics/101-136-cheatsheet.rst
@@ -5,11 +5,9 @@
DataLad cheat sheet
-------------------
-.. index:: ! Cheatsheet
-
.. only:: html
- Click on the image below to obtain a PDF version of the cheat sheet. Individual
+ Click on the image to obtain a PDF version of the cheat sheet. Individual
sections are linked to chapters or technical docs.
.. figure:: ../artwork/src/datalad-cheatsheet_p1.png
diff --git a/docs/basics/101-136-filesystem.rst b/docs/basics/101-136-filesystem.rst
index 0b502e393..5530f8510 100644
--- a/docs/basics/101-136-filesystem.rst
+++ b/docs/basics/101-136-filesystem.rst
@@ -1,4 +1,4 @@
.. _filesystem:
Miscellaneous file system operations
------------------------------------
@@ -26,11 +26,14 @@ Below you will find common questions about file system
management operations, and each question outlines caveats and
solutions with code examples you can paste into your own terminal.
Because these code snippets will add many commits to your
-dataset, we're cleaning up within each segment with
-common git operations that manipulate the datasets
+dataset, we are cleaning up within each segment with
+common Git operations that manipulate the dataset's
history -- be sure to execute these commands as well (and
be sure to be in the correct dataset).
+.. index::
+ pair: rename file; with DataLad
+
Renaming files
^^^^^^^^^^^^^^
@@ -102,15 +105,17 @@ new and the deleted file as a path specification to
save a change that is marked as a deletion in a
:dlcmd:`status`:
-.. code-block:: bash
+.. code-block:: console
- datalad save -m "rename file" oldname newname
+ $ datalad save -m "rename file" oldname newname
Alternatively, there is also a way to save the name change
-only using Git tools only, outlined in the following hidden
-section. If you are a Git user, you will be very familiar with it.
+using only Git tools, outlined in the :find-out-more:`on faster renaming `. If you are a Git user, you will be very familiar with it.
+.. index::
+ pair: rename file; with Git
.. find-out-more:: Faster renaming with Git tools
+ :name: fom-gitmv
Git has built-in commands that provide a solution in two steps.
@@ -125,7 +130,7 @@ section. If you are a Git user, you will be very familiar with it.
$ git reset --hard HEAD~1
$ datalad status
- Now we're checking out how to rename files and commit this operation
+ Now we are checking out how to rename files and commit this operation
using only Git:
A Git-specific way to rename files is the ``git mv`` command:
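+
+   As a minimal sketch with hypothetical file names -- ``git mv`` stages the rename, and a subsequent save records it:
+
+   .. code-block:: console
+
+      $ git mv oldname newname
+      $ datalad save -m "rename file"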
@@ -249,6 +254,8 @@ Therefore, in general, whenever moving or renaming a file,
especially between directories, a ``datalad save`` is
the best option to turn to.
+.. index::
+ pair: content pointer file; git-annex concept
.. find-out-more:: Why a move between directories is actually a content change
Let's see how this shows up in the dataset history:
@@ -269,7 +276,9 @@ the best option to turn to.
move plus a content change for Git.
-.. gitusernote:: git annex fix
+.. index::
+ pair: fix; git-annex command
+.. gitusernote:: 'datalad save' internals: 'git annex fix'
A :dlcmd:`save` command internally uses a :gitcmd:`commit` to save changes to a dataset.
:gitcmd:`commit` in turn triggers a :gitannexcmd:`fix`
@@ -287,6 +296,9 @@ Finally, let's clean up:
$ git reset --hard HEAD~1
+.. index::
+ pair: move file to other dataset; with DataLad
+
Moving files across dataset boundaries
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Generally speaking, moving files across dataset hierarchies is not advised.
@@ -346,10 +358,10 @@ as if the file was generated at once, instead of successively over the course:
:notes: clean-up
:cast: 03_git_annex_basics
- # in midterm_project
+ $ # in midterm_project
$ git reset --hard HEAD~
- # in DataLad-101
+ $ # in DataLad-101
$ cd ../
$ git reset --hard HEAD~
@@ -442,7 +454,7 @@ convenient. It can be a confusing and potentially "file-content-losing"-dangerou
process, but it also dissociates a file from its provenance that is captured
in its previous dataset, with no machine-readable way to learn about the move
easily. A better alternative may be copying files with the :dlcmd:`copy-file`
-command introduced in detail in the web version, and demonstrated in the next
+command introduced in detail in the online handbook, and demonstrated in the next
but one paragraph. Let's quickly clean up by moving the file back:
.. runrecord:: _examples/DL-101-136-137
@@ -450,7 +462,7 @@ but one paragraph. Let's quickly clean up by moving the file back:
:workdir: dl-101/DataLad-101/midterm_project
:notes: move file back
- # in midterm_project
+ $ # in midterm_project
$ git annex unannex TLCL.pdf
.. runrecord:: _examples/DL-101-136-138
@@ -500,6 +512,8 @@ file. Let's save it:
That's it.
+.. index::
+ pair: content pointer file; git-annex concept
.. find-out-more:: Symlinks!
If you have read the additional content in the section
@@ -526,7 +540,7 @@ That's it.
In most cases, this is just an interesting fun-fact, but beware
when dropping content with :dlcmd:`drop`
- (:ref:`remove`):
+ as outlined in :ref:`remove`:
If you drop the content of one copy of a file, all
other copies will lose this content as well.
@@ -542,6 +556,9 @@ Finally, let's clean up:
.. _copyfileFS:
+.. index::
+ pair: copy file to other dataset; with DataLad
+
Copying files across dataset boundaries
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -575,6 +592,7 @@ was not configured with the ``text2git`` :term:`run procedure`.
.. runrecord:: _examples/DL-101-136-147
:language: console
:workdir: dl-101/DataLad-101
+ :emphasize-lines: 3, 10
$ tree midterm_project
@@ -601,7 +619,7 @@ provenance record is lost:
Nevertheless, copying files with :dlcmd:`copy-file` is easier and safer
than moving them with standard Unix commands, especially so for annexed files.
A more detailed introduction to :dlcmd:`copy-file` and a concrete
-usecase can be found in the online version of the handbook.
+use case can be found in the online handbook.
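+
+In its simplest form, an invocation could look like this -- a sketch with hypothetical paths, assuming ``-d`` points to the dataset in which the copy should be recorded and ``-t`` to the target directory:
+
+.. code-block:: console
+
+   $ datalad copy-file midterm_project/code/script.py -d . -t code/
+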
Let's clean up:
@@ -621,7 +639,7 @@ the Unix :shcmd:`mv` command to move or rename, and the :dlcmd:`save`
to clean up afterwards, just as in the examples above. Make sure to
**not** use ``git mv``, especially for subdatasets.
-Let's for example rename the ``books`` directory:
+Let's, for example, rename the ``books`` directory:
.. runrecord:: _examples/DL-101-136-151
:language: console
@@ -740,107 +758,14 @@ use.
Beware of one thing though: If your dataset either is a sibling
or has a sibling with the source being a path, moving or renaming
the dataset will break the linkage between the datasets. This can
-be fixed easily though. We can try this in the following hidden
-section.
+be fixed easily though. We can try this in the :find-out-more:`on adjusting sibling URLs `.
+.. index::
+ pair: move subdataset; with Git
.. find-out-more:: If a renamed/moved dataset is a sibling...
+ :name: fom-adjust-sibling-urls
- As section :ref:`config` explains, each
- sibling is registered in ``.git/config`` in a "submodule" section.
- Let's look at how our sibling "roommate" is registered there:
-
- .. runrecord:: _examples/DL-101-136-160
- :language: console
- :workdir: dl-101/DataLad-101
- :emphasize-lines: 18-19
-
- $ cat .git/config
-
- As you can see, its "url" is specified as a relative path. Say your
- room mate's directory is a dataset you would want to move. Let's see
- what happens if we move the dataset such that the path does not point
- to the dataset anymore:
-
- .. runrecord:: _examples/DL-101-136-161
- :language: console
- :workdir: dl-101/DataLad-101
-
- # add an intermediate directory
- $ cd ../mock_user
- $ mkdir onemoredir
- # move your room mates dataset into this new directory
- $ mv DataLad-101 onemoredir
-
- This means that relative to your ``DataLad-101``, your room mates
- dataset is not at ``../mock_user/DataLad-101`` anymore, but in
- ``../mock_user/onemoredir/DataLad-101``. The path specified in
- the configuration file is thus wrong now.
-
- .. runrecord:: _examples/DL-101-136-162
- :language: console
- :exitcode: 1
- :workdir: dl-101/mock_user
-
- # navigate back into your dataset
- $ cd ../DataLad-101
- # attempt a datalad update
- $ datalad update
-
- Here we go:
-
- .. code-block:: text
-
- 'fatal: '../mock_user/DataLad-101' does not appear to be a git repository
- fatal: Could not read from remote repository.
-
- Git seems pretty insistent (given the amount of error messages) that
- it can not seem to find a Git repository at the location the ``.git/config``
- file specified. Luckily, we can provide this information. Edit the file with
- an editor of your choice and fix the path from
- ``url = ../mock_user/DataLad-101`` to
- ``url = ../mock_user/onemoredir/DataLad-101``.
-
- Below, we are using the stream editor `sed `_
- for this operation.
-
- .. runrecord:: _examples/DL-101-136-163
- :language: console
- :workdir: dl-101/DataLad-101
-
- $ sed -i 's/..\/mock_user\/DataLad-101/..\/mock_user\/onemoredir\/DataLad-101/' .git/config
-
- This is how the file looks now:
-
- .. runrecord:: _examples/DL-101-136-164
- :language: console
- :workdir: dl-101/DataLad-101
-
- $ cat .git/config
-
- Let's try to update now:
-
- .. runrecord:: _examples/DL-101-136-165
- :workdir: dl-101/DataLad-101
- :language: console
-
- $ datalad update
-
- Nice! We fixed it!
- Therefore, if a dataset you move or rename is known to other
- datasets from its path, or identifies siblings with paths,
- make sure to adjust them in the ``.git/config`` file.
-
- To clean up, we'll redo the move of the dataset and the
- modification in ``.git/config``.
-
- .. runrecord:: _examples/DL-101-136-166
- :language: console
- :workdir: dl-101/DataLad-101
-
- $ cd ../mock_user && mv onemoredir/DataLad-101 .
- $ rm -r onemoredir
- $ cd ../DataLad-101 && sed -i 's/..\/mock_user\/onemoredir\/DataLad-101/..\/mock_user\/DataLad-101/' .git/config
-
+ .. include:: topic/moved-sibling-path-fix.rst
Getting contents out of git-annex
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -855,10 +780,10 @@ annexed file into Git.
Consider you intend to share the cropped ``.png`` images you created from the
``longnow`` logos. Would you publish your ``DataLad-101`` dataset to :term:`GitHub`
or :term:`GitLab`, these files would not be available to others, because annexed
-dataset contents can not be published to these services.
+dataset contents cannot be published to these services.
Even though you could find a third party service of your choice
-and publish your dataset *and* the annexed data (see section :ref:`sharethirdparty`),
-you're feeling lazy today. And since it
+and publish your dataset *and* the annexed data as described in :ref:`sharethirdparty`,
+you are feeling lazy today. And since it
is only two files, and they are quite small, you decide to store them in Git --
this way, the files would be available without configuring an external data
store.
@@ -873,13 +798,13 @@ works:
$ git annex unannex recordings/*logo_small.jpg
-Your dataset's history records the unannexing of the files.
+The unannexing of the files shows up in your dataset as follows:
.. runrecord:: _examples/DL-101-136-168
:language: console
:workdir: dl-101/DataLad-101
- $ git log -p -n 1
+ $ git status
Once files have been unannexed, they are "untracked" again, and you can save them
into Git, either by adding a rule to ``.gitattributes``, or with
@@ -904,7 +829,7 @@ In case you want to get all annexed contents out of a Dataset at once, you could
It is a command that can be used to stop using git annex entirely in a given repository/dataset.
Running this command will unannex every file in the repository, remove all of git-annex's other data, and remove the :term:`git-annex` branch, leaving you with a normal Git repository plus the previously annexed files.
-Note a ``datalad push`` will reinstate the git-annex branch IF your dataset has siblings that still contain the annex branch.
+Note that a ``datalad push`` will reinstate the git-annex branch *if* your dataset has siblings that still contain the annex branch.
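+
+As a sketch -- the command described here is :gitannexcmd:`uninit`, run from the root of the dataset:
+
+.. code-block:: console
+
+   $ git annex uninit
+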
Deleting (annexed) files/directories
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -928,7 +853,7 @@ by going back into the history of the dataset or reverting the removal commit:
:notes: 2 ways to remove a file from dataset: remove the file from the current state of the repository (the *worktree*) but keeping the content in the history, or remove content entirely from a dataset and its history.
:cast: 03_git_annex_basics
- # download a file
+ $ # download a file
$ datalad download-url -m "Added flower mosaic from wikimedia" \
https://upload.wikimedia.org/wikipedia/commons/a/a5/Flower_poster_2.jpg \
--path flowers.jpg
@@ -940,7 +865,7 @@ by going back into the history of the dataset or reverting the removal commit:
:cast: 03_git_annex_basics
- # removal is easy:
+ $ # removal is easy:
$ rm flowers.jpg
This will lead to a dirty dataset status:
@@ -981,13 +906,13 @@ If this commit is reverted, the file comes back to existence:
In other words, with an :shcmd:`rm` and subsequent :dlcmd:`save`,
the symlink is removed, but the content is retained in the history.
+.. index::
+ pair: drop; DataLad command
.. _remove:
Removing annexed content entirely
"""""""""""""""""""""""""""""""""
-.. index:: ! datalad command; drop
-
The command to remove file content entirely and irreversibly from a repository is
the :dlcmd:`drop` command.
This command will delete the content stored in the annex of the dataset,
@@ -1031,7 +956,7 @@ remaining symlink will fail, but the content can be obtained easily again with
$ datalad get flowers.jpg
If a file has no verified remote copies, DataLad will only drop its
-content if the user enforces it using the ``--reckless [MODE]`` option, where ``[MODE]`` is either ``modification`` (drop despite unsaved modifications) ``availability`` (drop even though no other copy is known) ``undead`` (only for datasets; would drop a dataset without announcing its death to linked dataset clones) or ``kill`` (no safety checks at all are run).
+content if the user enforces it using the ``--reckless [MODE]`` option, where ``[MODE]`` is either ``modification`` (drop despite unsaved modifications), ``availability`` (drop even though no other copy is known), ``undead`` (only for datasets; would drop a dataset without announcing its death to linked dataset clones) or ``kill`` (no safety checks at all are run).
We will demonstrate this by generating an empty file:
.. runrecord:: _examples/DL-101-136-177
@@ -1043,13 +968,13 @@ We will demonstrate this by generating an empty file:
$ dd if=/dev/zero | head -c 18520 > a.pdf
$ datalad save -m "add some file" a.pdf
-DataLad will safeguard dropping content that it can not retrieve again:
+DataLad will safeguard dropping content that it cannot retrieve again:
.. runrecord:: _examples/DL-101-136-178
:workdir: dl-101/DataLad-101
:language: console
:exitcode: 1
- :notes: datalad does not know how to re-obtain the file, so it complains
+ :notes: datalad does not know how to reobtain the file, so it complains
:cast: 03_git_annex_basics
$ datalad drop a.pdf
@@ -1098,25 +1023,21 @@ private :term:`SSH key`\s or passwords, or too many or too large files are
accidentally saved into Git, and *need* to get out of the dataset history.
The command ``git-filter-repo <path-specification> --force`` will "filter-out",
i.e., remove all files **but the ones specified** in ``<path-specification>``
-from the dataset's history. An advanced chapter in the online version of the handbook
+from the dataset's history. An advanced chapter in the online handbook
shows an example invocation.
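+
+As a hedged sketch -- the path below is purely illustrative, and ``--path`` selects the content to *keep*:
+
+.. code-block:: console
+
+   $ git-filter-repo --path code/keep_this.py --force
+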
+.. index::
+ pair: drop; DataLad command
+
Uninstalling or deleting subdatasets
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. index:: ! datalad command; uninstall
-
Depending on the exact aim, different commands are of relevance for
-deleting a DataLad subdataset. The softer (and not so much "deleting" version)
-is to uninstall a dataset with a :dlcmd:`drop`.
+deleting a DataLad subdataset.
+One way to uninstall a dataset is the :dlcmd:`drop` command.
To work on datasets, ``drop`` needs to be parametrized with ``--what all``.
If needed, add ``--recursive`` in case the dataset contains subdatasets, and a
-fitting ``--reckless`` mode.
-A stand-alone command, :dlcmd:`uninstall`, wraps a ``datalad drop --what all --reckless kill``.
-This command can be used to uninstall any number of
-*subdatasets*. Note though that only subdatasets can be uninstalled; the command
-will error if given a sub-*directory*, a file, or a top-level dataset.
-Using the :dlcmd:`drop` command gives you greater flexibility.
+fitting ``--reckless`` mode, such as ``datalad drop --what all --reckless kill --recursive``.
.. runrecord:: _examples/DL-101-136-181
:language: console
@@ -1124,7 +1045,7 @@ Using the :dlcmd:`drop` command gives you greater flexibility.
:notes: To get rid of subdatasets one can either uninstall or remove them. let's clone one to see:
:cast: 03_git_annex_basics
- # clone a subdataset - the content is irrelevant, so why not a cloud :)
+ $ # clone a subdataset - the content is irrelevant, so why not a cloud :)
$ datalad clone -d . \
https://github.com/datalad-datasets/disneyanimation-cloud.git \
cloud
@@ -1134,15 +1055,16 @@ To uninstall the dataset, you can use
.. runrecord:: _examples/DL-101-136-182
:language: console
:workdir: dl-101/DataLad-101
- :notes: uninstall uninstalls the dataset, but it is still registered in the superdataset. a dl install will get the dataset again!
+ :notes: drop uninstalls the dataset, but it is still registered in the superdataset. a dl install will get the dataset again!
:cast: 03_git_annex_basics
- $ datalad uninstall cloud
+ $ datalad drop --what all --reckless kill --recursive cloud
Note that the dataset is still known in the superdataset, and not completely removed.
A ``datalad get [-n/--no-data] cloud`` would install the dataset again.
-.. index:: ! datalad command; remove
+.. index::
+ pair: remove; DataLad command
In case one wants to fully delete a subdataset from a dataset, the
:dlcmd:`remove` command is relevant [#f3]_.
@@ -1166,11 +1088,11 @@ subsequently remove it with the :dlcmd:`remove` command:
:notes: to completely remove the dataset, use datalad remove
:cast: 03_git_annex_basics
- # delete the subdataset
+ $ # delete the subdataset
$ datalad remove -m "remove obsolete subds" -d . cloud
Note that for both commands a pointer to the *current directory* will not work.
-``datalad remove .`` or ``datalad uninstall .`` will fail, even if
+``datalad remove .`` or ``datalad drop .`` will fail, even if
the command is executed in a subdataset instead of the top-level
superdataset -- you need to execute the command from a higher-level directory.
@@ -1182,10 +1104,10 @@ If for whatever reason you at one point tried to remove a DataLad dataset,
whether with a GUI or the command line call ``rm -rf ``, you likely
have seen permission denied errors such as
-.. code-block: bash
+.. code-block:: console
- rm: cannot remove '/.git/annex/objects/Mz/M1/MD5E-s422982--2977b5c6ea32de1f98689bc42613aac7.jpg/MD5E-s422982--2977b5c6ea32de1f98689bc42613aac7.jpg': Permission denied
- rm: cannot remove '/.git/annex/objects/FP/wv/MD5E-s543180--6209797211280fc0a95196b0f781311e.jpg/MD5E-s543180--6209797211280fc0a95196b0f781311e.jpg': Permission denied
+ rm: cannot remove '/.git/annex/objects/Mz/M1/MD5E-s422982--2977b5c6ea32de1f98689bc42613aac7.jpg/MD5E-s422982--2977b5c6ea32de1f98689bc42613aac7.jpg': Permission denied
+ rm: cannot remove '/.git/annex/objects/FP/wv/MD5E-s543180--6209797211280fc0a95196b0f781311e.jpg/MD5E-s543180--6209797211280fc0a95196b0f781311e.jpg': Permission denied
[...]
This error indicates that there is write-protected content within ``.git`` that
@@ -1196,22 +1118,21 @@ to protect the file content given to it. To remove a dataset with annexed conten
one has to regain write permissions to everything in the dataset. This is done
with the Unix ``chmod`` command:
-.. code-block:: bash
+.. code-block:: console
- chmod -R u+w <dataset>
+ $ chmod -R u+w <dataset>
This *recursively* (``-R``, i.e., throughout all files and (sub)directories) gives users
(``u``) write permissions (``+w``) for the dataset.
Afterwards, ``rm -rf <dataset>`` will succeed.
-However, instead of ``rm -rf``, a faster way to remove a dataset is using
-:dlcmd:`remove`: Run ``datalad remove `` outside of the
+However, instead of ``rm -rf``, a faster way to remove a dataset is using either :dlcmd:`drop` or :dlcmd:`remove`: Run ``datalad drop -d <dataset>`` or ``datalad remove -d <dataset>`` outside of the
superdataset to remove a top-level dataset with all its contents. Likely,
both ``--recursive`` and ``--reckless [availability|undead|kill]`` flags are necessary
to traverse into subdatasets and to remove content that does not have verified remotes.
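+
+Put together, such a call could look like this -- a sketch with a hypothetical path:
+
+.. code-block:: console
+
+   $ datalad remove -d path/to/dataset --recursive --reckless availability
+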
-Be aware though that both ways to delete a dataset will
+Be aware, though, that deleting a dataset in whichever way will
irretrievably delete the dataset, its contents, and its history.
Summary
diff --git a/docs/basics/101-137-history.rst b/docs/basics/101-137-history.rst
index c7c5c456b..da7382892 100644
--- a/docs/basics/101-137-history.rst
+++ b/docs/basics/101-137-history.rst
@@ -6,7 +6,7 @@ Back and forth in time
Almost everyone inadvertently deleted or overwrote files at some point with
a hasty operation that caused data fatalities or at least troubles to
-re-obtain or restore data.
+reobtain or restore data.
With DataLad, no mistakes are forever: One powerful feature of datasets
is the ability to revert data to a previous state and thus view earlier content or
correct mistakes. As long as the content was version controlled (i.e., tracked),
@@ -50,7 +50,7 @@ done. For the rest of the lecture, call me Google!"
Fixing (empty) commit messages
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-From the back of the lecture hall comes a question you're really glad
+From the back of the lecture hall comes a question you are really glad
someone asked: "It has happened to me that I accidentally did a
:dlcmd:`save` and forgot to specify the commit message,
how can I fix this?".
@@ -119,6 +119,22 @@ Try running the :gitcmd:`commit --amend` command right now and give
the commit a new commit message (you can just delete the one created by
DataLad in the editor)!
+.. index::
+ pair: save --amend; DataLad command
+ pair: add changes to previous commit; with DataLad
+ pair: change the last commit message; with DataLad
+.. gitusernote:: 'git commit --amend' versus 'datalad save --amend'
+
+ Similar to ``git commit``, ``datalad save`` also has an ``--amend`` option.
+ Like its Git equivalent, it can be used to record changes not in a new, separate commit, but integrate them with the previously saved state.
+ Though this has not been the use case for ``git commit --amend`` here, experienced Git users will be accustomed to using ``git commit --amend`` to achieve something similar in their Git workflows.
+ In contrast to ``git commit --amend``, ``datalad save --amend`` will not open up an interactive editor to potentially change a commit message (unless the configuration ``datalad.save.no-message`` is set to ``interactive``), but a new commit message can be supplied with the ``-m``/``--message`` option.
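+
+   As a sketch, amending the last saved state with a new commit message could look like this:
+
+   .. code-block:: console
+
+      $ datalad save --amend -m "a better commit message"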
+
+
+.. index::
+ pair: change historical commit messages; with Git
+ pair: rebase; Git command
+ pair: rewrite history; with Git
.. find-out-more:: Changing the commit messages of not-the-most-recent commits
:name: fom-rebase1
:float:
@@ -129,12 +145,12 @@ DataLad in the editor)!
can do so during a so-called "interactive rebase". The command
for this is
- .. code-block:: bash
+ .. code-block:: console
$ git rebase -i HEAD~N
where ``N`` specifies how far back you want to rewrite commits.
- ``git rebase -i HEAD~3`` for example lets you apply changes to the
+ ``git rebase -i HEAD~3``, for example, lets you apply changes to
any number of commit messages within the last three commits.
Be aware that an interactive rebase lets you *rewrite* history.
@@ -188,6 +204,9 @@ DataLad in the editor)!
But be careful not to delete any lines in the above editor view --
**An interactive rebase can be dangerous, and if you remove a line, this commit will be lost!**
+.. index::
+ pair: stop content tracking; with Git
+
Untracking accidentally saved contents (tracked in Git)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -218,9 +237,9 @@ undo commits by resetting the history of a dataset to an earlier version.
exact behavior it, but the relevant one for this aim is ``--mixed`` [#f3]_.
Specifying the command:
-.. code-block:: bash
+.. code-block:: console
- git reset --mixed COMMIT
+ $ git reset --mixed COMMIT
will preserve all changes made to files since the specified
commit in the dataset but remove them from the dataset's history.
@@ -234,6 +253,9 @@ try this to get a feel for it.
The COMMIT in the command can either be a hash or a reference
with the HEAD pointer.
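+
+For example, using the ``HEAD`` notation -- a sketch that would undo the most recent commit while keeping its changes in the worktree:
+
+.. code-block:: console
+
+   $ git reset --mixed HEAD~1
+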
+.. index::
+ pair: branch; Git concept
+ pair: HEAD; Git concept
.. find-out-more:: Git terminology: branches and HEADs?
A Git repository (and thus any DataLad dataset) is built up as a tree of
@@ -252,7 +274,7 @@ with the HEAD pointer.
`less frequently used and of importance primarily in the case of merge
commits `__.
-Let's stay with the hash, and reset to the commit prior to saving the Gitjokes.
+Let's stay with the hash, and reset to the commit prior to saving the Git jokes.
First, find out the shasum, and afterwards, reset it.
@@ -313,6 +335,9 @@ Finally, let's check how the history looks afterwards:
Wow! You have rewritten history [#f4]_!
+.. index::
+ pair: stop content tracking; with git-annex
+
Untracking accidentally saved contents (stored in git-annex)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -337,9 +362,9 @@ gets annexed with an accidental :dlcmd:`save`:
:language: console
:workdir: dl-101/DataLad-101
- # create an empty pdf file
+ $ # create an empty pdf file
$ convert xc:none -page Letter apdffile.pdf
- # accidentally save it
+ $ # accidentally save it
$ datalad save
This accidental :dlcmd:`save` has thus added both text files
@@ -488,12 +513,15 @@ prior :gitcmd:`checkout` (note that the output is shortened for brevity and show
:lines: 2, 48-
:realcommand: echo "$ git cat-file --textconv $(git rev-parse HEAD~15):notes.txt" && git cat-file --textconv $(git rev-parse HEAD~15):notes.txt
+.. index::
+ pair: cat-file; Git command
+
The cat-file command is very versatile, and
`its documentation `_ will list all
of its functionality. To use it to see the contents of a file at a previous
state as done above, this is how the general structure looks:
-.. code-block:: bash
+.. code-block:: console
$ git cat-file --textconv SHASUM:<path/to/file>
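+
+For instance, mirroring the invocation above with ``HEAD~15`` and ``notes.txt``:
+
+.. code-block:: console
+
+   $ git cat-file --textconv $(git rev-parse HEAD~15):notes.txt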
@@ -619,7 +647,10 @@ under which situations and how to perform such an interactive rebase.
However, outlining an interactive rebase here in the handbook could lead to
problems for readers without (much) Git experience: An interactive rebase,
even if performed successfully, can lead to many problems if it is applied with
-too little experience, for example in any collaborative real-world project.
+too little experience, for example, in any collaborative real-world project.
+
+.. index::
+ pair: revert; Git command
Instead, we demonstrate a different, less intrusive way to revert one or more
changes at any point in the history of a dataset: the :gitcmd:`revert`
@@ -629,18 +660,20 @@ the changes of an unwanted commit are reverted.
The command looks like this:
-.. code-block:: bash
+.. code-block:: console
$ git revert SHASUM
where ``SHASUM`` specifies the commit hash of the modification that should
be reverted.
+.. index::
+ pair: revert multiple commits; with Git
.. find-out-more:: Reverting more than a single commit
You can also specify a range of commits like this:
- .. code-block:: bash
+ .. code-block:: console
$ git revert OLDER_SHASUM..NEWERSHASUM
@@ -652,7 +685,7 @@ be reverted.
reversal commits. If you however want the reversal of a range of commits
saved in a single commit, supply the ``--no-commit`` option as in
- .. code-block:: bash
+ .. code-block:: console
$ git revert --no-commit OLDER_SHASUM..NEWERSHASUM
@@ -706,6 +739,9 @@ As you can see, unsurprisingly, the :gitcmd:`revert` command had no
effects on anything else but the specified commit, and previously untracked
files are still present.
+.. index::
+ pair: resolve merge conflict; with Git
+
Oh no! I'm in a merge conflict!
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -745,7 +781,7 @@ To conclude this section, let's remove all untracked contents from
the dataset. This can be done with :gitcmd:`clean`: The command
:gitcmd:`clean -f` swipes your dataset clean and removes any untracked
file.
-**Careful! This is not revertible, and content lost with this commands can not be recovered!**
+**Careful! This is not revertible, and content lost with this command cannot be recovered!**
If you want to be extra sure, run :gitcmd:`clean -fn` beforehand -- this will
give you a list of the files that would be deleted.
@@ -784,10 +820,10 @@ to remove the ``Gitjoke2.txt`` file.
.. [#f1] For example, the :dlcmd:`rerun` command introduced in section
:ref:`run2` takes such a hash as an argument, and re-executes
the ``datalad run`` or ``datalad rerun`` :term:`run record` associated with
- this hash. Likewise, the :gitcmd:`diff` can work with commit hashes.
+ this hash. Likewise, the :gitcmd:`diff` command can work with commit hashes.
.. [#f2] There are other alternatives to reference commits in the history of a dataset,
- for example "counting" ancestors of the most recent commit using the notation
+ for example, "counting" ancestors of the most recent commit using the notation
``HEAD~2``, ``HEAD^2`` or ``HEAD@{2}``. However, using hashes to reference
commits is a very fail-safe method and saves you from accidentally miscounting.
diff --git a/docs/basics/101-138-sharethirdparty.rst b/docs/basics/101-138-sharethirdparty.rst
index cf713bbbe..9026d7192 100644
--- a/docs/basics/101-138-sharethirdparty.rst
+++ b/docs/basics/101-138-sharethirdparty.rst
@@ -11,21 +11,19 @@ Data sharing potentially involves a number of different elements:
An overview of all elements potentially included in a publication workflow.
Users on a common, shared computational infrastructure such as an :term:`SSH server`
-can share datasets via simple installations with paths, without any involvement of third party storage providers or repository hosting services:
+can share datasets via simple installations with paths, without any involvement of third party storage providers or repository hosting services, as shown in :numref:`fig-clonecompute`.
-|pic1| |pic2|
+.. _fig-clonecompute:
-.. |pic1| image:: ../artwork/src/publishing/clone_local.svg
- :width: 45%
+.. figure:: ../artwork/src/publishing/clone_combined.svg
-.. |pic2| image:: ../artwork/src/publishing/clone_server.svg
- :width: 45%
+ Cloning from local or remote compute infrastructure.
But at some point in a dataset's life, you may want to share it with people that
can't access the computer or server your dataset lives on, store it on other infrastructure
to save diskspace, or create a backup.
When this happens, you will want to publish your dataset to repository hosting
-services (for example :term:`GitHub`, :term:`GitLab`, or :term:`Gin`)
+services (for example, :term:`GitHub`, :term:`GitLab`, or :term:`GIN`)
and/or third party storage providers (such as Dropbox_, Google_,
`Amazon S3 buckets `_,
the `Open Science Framework`_ (OSF), and many others).
@@ -56,7 +54,7 @@ Leveraging third party infrastructure
There are several ways to make datasets available for others:
-- You can **publish your dataset to a repository with annex support** such as :term:`Gin` or the OSF_ [#f1]_. This is the easiest way to share datasets and all their contents. Read on in the section :ref:`gin` or consult the tutorials of the `datalad-osf extension`_ to learn how to do this.
+- You can **publish your dataset to a repository with annex support** such as :term:`GIN` or the OSF_ [#f1]_. This is the easiest way to share datasets and all their contents. Read on in the section :ref:`gin` or consult the tutorials of the `datalad-osf extension`_ to learn how to do this.
- You can **publish your dataset to a repository hosting service**, and **configure an external resource that stores your annexed data**. Such a resource can be a private web server, but also third party cloud storage services such as Dropbox_, Google_, `Amazon S3 buckets `_, `Box.com `_, `owncloud `_, `sciebo `_, or many more.
@@ -70,7 +68,7 @@ Dataset contents and third party services influence sharing
Because DataLad datasets are :term:`Git` repositories, it is possible to
:dlcmd:`push` datasets to any Git repository hosting service, such as
-:term:`GitHub`, :term:`GitLab`, :term:`Gin`, :term:`Bitbucket`, `Gogs `_,
+:term:`GitHub`, :term:`GitLab`, :term:`GIN`, :term:`Bitbucket`, `Gogs `_,
or Gitea_.
You have already done this in section :ref:`yoda_project` when you shared your ``midterm_project`` dataset via :term:`GitHub`.
@@ -100,21 +98,23 @@ The common case: Repository hosting without annex support and special remotes
Because DataLad datasets are :term:`Git` repositories, it is possible to
:dlcmd:`push` datasets to any Git repository hosting service, such as
-:term:`GitHub`, :term:`GitLab`, :term:`Gin`, :term:`Bitbucket`, `Gogs `_,
+:term:`GitHub`, :term:`GitLab`, :term:`GIN`, :term:`Bitbucket`, `Gogs `_,
or Gitea_.
But while anything that is managed by Git is accessible in repository hosting services, they usually don't support storing annexed data [#f2]_.
When you want to publish a dataset to a Git repository hosting service to allow others to easily find and clone it, but you also want others to be able to retrieve annexed files in this dataset via :dlcmd:`get`, annexed contents need to be pushed to additional storage hosting services.
The hosting services can be all kinds of private, institutional, or commercial services, and their location will be registered in the dataset under the concept of a :term:`special remote`.
+.. index::
+ pair: special remote; git-annex concept
.. find-out-more:: What is a special remote
A special-remote is an extension to Git’s concept of remotes, and can
enable :term:`git-annex` to transfer data from and possibly to places that are not Git
repositories (e.g., cloud services or external machines such as an HPC
system). For example, an *s3* special remote uploads and downloads content
- to AWS S3, a *web* special remote downloads files from the web, and *datalad-archive*
- extracts files from the annexed archives, etc. Don’t envision a special-remote
+ to AWS S3, a *web* special remote downloads files from the web, the *datalad-archive* special remote
+ extracts files from annexed archives, etc. Don’t envision a special-remote
as merely a physical place or location – a special-remote is a protocol that
defines the underlying transport of your files to and/or from a specific location.
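   As a minimal sketch, assuming a plain directory on an external drive should serve
   as storage, a special remote is initialized once and afterwards used like any
   other annex remote:

   .. code-block:: console

      $ git annex initremote backupdrive type=directory directory=/mnt/backup encryption=none
      $ git annex copy --to backupdrive books/TLCL.pdf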
@@ -150,11 +150,15 @@ installing/setting up the relevant *special-remote*, obtaining your dataset and
data is as easy as with any public DataLad dataset.
While you have to invest some setup effort in the beginning, once this
is done, the workflows of yours and others are the same that you are already
-very familiar with.
+very familiar with, as :numref:`fig-cloneurls` illustrates.
+
+.. _fig-cloneurls:
.. figure:: ../artwork/src/publishing/clone_url.svg
:width: 60%
+ Cloning from remote URLs.
+
If you are interested in learning how to set up different services as special remotes, you can take a look at the sections :ref:`s3`, :ref:`dropbox` or :ref:`gitlfs` for concrete examples with DataLad datasets, and the general section :ref:`share_hostingservice` on setting up dataset siblings.
In addition, there are step-by-step walk-throughs in the documentation of git-annex for services such as `S3 `_, `Google Cloud Storage `_,
@@ -168,13 +172,17 @@ Here is the complete list: `git-annex.branchable.com/special_remotes `_ repository to your machine (do not clone it into ``DataLad-101`` but somewhere else on your computer), and copy the path to this repository into your ``$PATH`` variable. If you
clone into ``/home/user-bob/repos``, the command would look like this [#f3]_:
- .. code-block:: bash
+ .. code-block:: console
$ git clone https://github.com/DanielDent/git-annex-remote-rclone.git
$ export PATH="/home/user-bob/repos/git-annex-remote-rclone:$PATH"
@@ -124,7 +124,7 @@ from a large number of commercial providers [#f2]_.
- Finally, in the dataset you want to share, run the :gitannexcmd:`initremote` command.
Give the remote a name (it is ``dropbox-for-friends`` here), and specify the name of the remote you configured with ``rclone`` with the ``target`` parameter:
-.. code-block:: bash
+.. code-block:: console
$ git annex initremote dropbox-for-friends type=external externaltype=rclone chunk=50MiB encryption=none target=dropbox-for-friends prefix=my_awesome_dataset
@@ -135,7 +135,7 @@ What has happened up to this point is that we have configured Dropbox
as a third-party storage service for the annexed contents in the dataset.
On a conceptual, dataset level, your Dropbox folder is now a :term:`sibling` -- the sibling name is the first positional argument after ``initremote``, i.e., "dropbox-for-friends":
-.. code-block:: bash
+.. code-block:: console
$ datalad siblings
.: here(+) [git]
@@ -159,7 +159,8 @@ be managed and accessed via DataLad/git-annex.
To actually share your dataset with someone, you need to *publish* it to GitHub,
GitLab, or a similar hosting service.
-.. index:: ! datalad command; create-sibling-github
+.. index::
+ pair: create-sibling-github; DataLad command
You could, for example, create a sibling of the ``DataLad-101`` dataset
on GitHub with the command :dlcmd:`create-sibling-github`.
@@ -171,7 +172,7 @@ However, in order to be able to link the contents stored in Dropbox, you also ne
configure a *publication dependency* to the ``dropbox-for-friends`` sibling -- this is
done with the ``publish-depends `` option.
-.. code-block:: bash
+.. code-block:: console
$ datalad create-sibling-github -d . DataLad-101 \
--publish-depends dropbox-for-friends
@@ -181,7 +182,7 @@ done with the ``publish-depends `` option.
:dlcmd:`siblings` will again list all available siblings:
-.. code-block:: bash
+.. code-block:: console
$ datalad siblings
.: here(+) [git]
@@ -199,13 +200,15 @@ publishing to GitHub dependent on the ``dropbox-for-friends`` sibling
(that has a remote data annex), so that annexed contents are published
there first.
+.. index::
+ pair: publication dependency; DataLad concept
.. importantnote:: Publication dependencies are strictly local configuration
Note that the publication dependency is only established for your own dataset;
it is not shared with clones of the dataset. Internally, this configuration
is a key-value pair in the section of your remote in ``.git/config``:
- .. code-block:: bash
+ .. code-block:: ini
[remote "github"]
annex-ignore = true
@@ -216,7 +219,7 @@ there first.
With this setup, we can publish the dataset to GitHub. Note how the publication
dependency is served first:
-.. code-block:: bash
+.. code-block:: console
:emphasize-lines: 2
$ datalad push --to github
@@ -251,7 +254,7 @@ have to do:
If the repository is on GitHub, a :dlcmd:`clone` with the URL
will install the dataset:
-.. code-block:: bash
+.. code-block:: console
$ datalad clone https://github.com//DataLad-101.git
[INFO ] Cloning https://github.com//DataLad-101.git [1 other candidates] into '/Users/awagner/Documents/DataLad-101'
@@ -262,7 +265,7 @@ will install the dataset:
Pay attention to one crucial piece of information in this output:
-.. code-block:: bash
+.. code-block:: console
[INFO ] access to 1 dataset sibling dropbox-for-friends not auto-enabled, enable with:
| datalad siblings -d "/Users//Documents/DataLad-101" enable -s dropbox-for-friends
@@ -284,16 +287,16 @@ the same as before:
After this is done, you can execute what DataLad's output message suggests
to "enable" this special remote (inside of the installed ``DataLad-101``):
-.. code-block:: bash
+.. code-block:: console
$ datalad siblings -d "/Users/awagner/Documents/DataLad-101" \
enable -s dropbox-for-friends
.: dropbox-for-friends(?) [git]
-And once this is done, you can get any annexed file contents, for example the
+And once this is done, you can get any annexed file contents, for example, the
books, or the cropped logos from chapter :ref:`chapter_run`:
-.. code-block:: bash
+.. code-block:: console
$ datalad get books/TLCL.pdf
get(ok): /home/some/other/user/DataLad-101/books/TLCL.pdf (file) [from dropbox-for-friends]
diff --git a/docs/basics/101-139-figshare.rst b/docs/basics/101-139-figshare.rst
index d4a64cb15..662a10370 100644
--- a/docs/basics/101-139-figshare.rst
+++ b/docs/basics/101-139-figshare.rst
@@ -8,7 +8,7 @@ annexed content to a variety of third party infrastructure, DataLad also has
some built-in support for "exporting" data to other services.
This usually means that a static snapshot of your dataset and its files are shared
in archives or collections of files.
-While an export of a dataset looses some of the advantages that a DataLad dataset has, for example a transparent version history, it can be a fast and simple way to make the most recent version of your dataset available or archived.
+While an export of a dataset loses some of the advantages that a DataLad dataset has, for example, a transparent version history, it can be a fast and simple way to make the most recent version of your dataset available or archived.
One example is the command :dlcmd:`export-archive`.
Running this command creates a ``.tar.gz`` file with the content of your dataset.
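As a sketch (the archive filename is hypothetical):

.. code-block:: console

   $ datalad export-archive ../DataLad-101-snapshot.tar.gz
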
@@ -21,7 +21,7 @@ Running :dlcmd:`export-to-figshare` allows you to publish the dataset as a snaps
Note that this requires a free account on Figshare, and the generation of an `access token `_ for authentication.
An interactive prompt will ask you to supply authentication credentials, and guide you through the process of creating a new article.
-.. code-block:: bash
+.. code-block:: console
:emphasize-lines: 5
$ datalad export-to-figshare
@@ -45,17 +45,20 @@ An interactive prompt will ask you to supply authentication credentials, and gui
-The screenshot below shows how the ``DataLad-101`` dataset looks like in exported form:
+The screenshot in :numref:`figfigshare` shows how the ``DataLad-101`` dataset looks in exported form:
+.. _figfigshare:
.. figure:: ../artwork/src/figshare_screenshot.png
:width: 50%
+ The dataset export on Figshare
+
You could then extend the dataset with metadata, obtain a `DOI `_ for it and make it citable, and point others to it so that they can download it as an archive of files.
Beyond this, because the command :dlcmd:`export-archive` is used internally to prepare content for upload to Figshare, annexed files will also be annotated as available from the archive on Figshare using the ``datalad-archive`` special remote.
As a result, if you publish your Figshare dataset and share your DataLad dataset on a repository hosting service without support for annexed files, users will still be able to fetch content from the tarball shared on Figshare.
-.. code-block:: bash
+.. code-block:: console
$ datalad siblings
.: here(+) [git]
diff --git a/docs/basics/101-139-gin.rst b/docs/basics/101-139-gin.rst
index 3b57858a4..e717ab7fe 100644
--- a/docs/basics/101-139-gin.rst
+++ b/docs/basics/101-139-gin.rst
@@ -11,12 +11,12 @@ fine-grained access control to share data. :term:`GIN` builds up on :term:`Git`
and share your DataLad datasets [#f1]_. It allows you to share datasets and their
contents with selected collaborators or to make them publicly and anonymously
available.
-:ref:`And even if you prefer to expose and share your datasets via GitHub, you can still use Gin to host your data `.
+:ref:`And even if you prefer to expose and share your datasets via GitHub, you can still use GIN to host your data `.
.. figure:: ../artwork/src/publishing/publishing_network_publishgin.svg
:width: 80%
- Some repository hosting services such as Gin have annex support, and can thus hold the complete dataset. This makes publishing datasets very easy.
+ Some repository hosting services such as GIN have annex support, and can thus hold the complete dataset. This makes publishing datasets very easy.
Prerequisites
^^^^^^^^^^^^^
@@ -44,19 +44,20 @@ You should copy the contents of your public key file into the field labeled
"My private work station". Afterwards, you are done!
+.. index::
+ pair: create-sibling-gin; DataLad command
+
Publishing your dataset to GIN
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. index:: ! datalad command; create-sibling-gin
-
-As outlined in the section :ref:`share_hostingservice`, there are two ways in which you can publish your dataset to Gin.
+As outlined in the section :ref:`share_hostingservice`, there are two ways in which you can publish your dataset to GIN.
Either by 1) creating a new, empty repository on GIN via the web interface, or 2) by using the :dlcmd:`create-sibling-gin` command.
-**1) via webinterface:** If you choose to create a new repository via Gin's web interface, make sure to not initialize it with a README:
+**1) via web interface:** If you choose to create a new repository via GIN's web interface, make sure not to initialize it with a README:
.. figure:: ../artwork/src/GIN_newrepo.png
- Create a new repository on Gin using the web interface.
+ Create a new repository on GIN using the web interface.
Afterwards, add this repository as a sibling of your dataset. To do this, use the
:dlcmd:`siblings add` command and the SSH URL of the repository as shown below.
@@ -105,7 +106,7 @@ What is especially cool is that the GIN web interface (unlike :term:`GitHub`) ca
.. figure:: ../artwork/src/GIN_dl101_repo.png
- A published dataset in a Gin repository at gin.g-node.org.
+ A published dataset in a GIN repository at gin.g-node.org.
.. _access:
@@ -116,19 +117,19 @@ Once your dataset is published, you can point collaborators and friends to it.
If it is a **public** repository, retrieving the dataset and getting access to
all published data contents (in a read-only fashion) is done by cloning the
-repository's ``https`` url. This does not require a user account on Gin.
+repository's ``https`` URL. This does not require a user account on GIN.
+.. index::
+ pair: clone; DataLad command
.. importantnote:: Take the URL in the browser, not the copy-paste URL
- .. index:: ! datalad command; clone
-
Please note that you need to use the browser URL of the repository, not the copy-paste URL on the upper right hand side of the repository if you want to get anonymous HTTPS access!
The two URLs differ only by a ``.git`` extension:
* Browser bar: ``https://gin.g-node.org//``
* Copy-paste "HTTPS clone": ``https://gin.g-node.org//.git``
- A dataset cloned from ``https://gin.g-node.org//.git``, however, can not retrieve annexed files!
+ A dataset cloned from ``https://gin.g-node.org//.git``, however, cannot retrieve annexed files!
.. runrecord:: _examples/DL-101-139-107
:language: console
@@ -139,9 +140,10 @@ repository's ``https`` url. This does not require a user account on Gin.
Subsequently, :dlcmd:`get` calls will be able to retrieve all annexed
file contents that have been published to the repository.
-.. index:: ! datalad command; clone
+.. index::
+ pair: clone; DataLad command
-If it is a **private** dataset, cloning the dataset from Gin requires a user
+If it is a **private** dataset, cloning the dataset from GIN requires a user
name and password for anyone you want to share your dataset with.
The "Collaboration" tab under Settings lets you set fine-grained access rights,
and it is possible to share datasets with collaborators that are not registered
@@ -150,13 +152,14 @@ If you are unsure if your dataset is private, :ref:`this find-out-more shows you
In order to get access to annexed contents, cloning *requires* setting up
an SSH key as detailed above, and cloning via the SSH URL:
-.. code-block:: bash
+.. code-block:: console
$ datalad clone git@gin.g-node.org:/adswa/DataLad-101.git
-Likewise, in order to publish changes back to a Gin repository, the repository needs
+Likewise, in order to publish changes back to a GIN repository, the repository needs
to be cloned via its SSH URL.
+.. index:: dataset hosting; GIN
.. find-out-more:: How do I know if my repository is private?
:name: fom-private-gin
:float:
@@ -164,11 +167,15 @@ to be cloned via its SSH url.
Private repos are marked with a lock sign. To make it public, untick the
"Private" box, found under "Settings":
- .. image:: ../artwork/src/GIN_private.png
+ ..
+ the image below can't become a figure because it can't be used in LaTeX's minipage environment
+ .. image:: ../artwork/src/GIN_private.png
+.. index::
+ pair: subdatasets; DataLad command
.. _subdspublishing:
Subdataset publishing
@@ -184,7 +191,7 @@ you click on it, you would get to a 404 Error page. The crucial difference betwe
subdataset and the longnow dataset is its entry in the ``.gitmodules`` file of
``DataLad-101``:
-.. code-block:: bash
+.. code-block:: ini
:emphasize-lines: 4, 8
$ cat .gitmodules
@@ -246,23 +253,26 @@ the problem:
If the subdataset was not published before, you could publish the subdataset to
a location of your choice, and modify the ``.gitmodules`` entry accordingly.
+.. index::
+ single: configuration item; remote..annex-ignore
+ pair: configure sibling; with DataLad
.. _ginbts:
-Using Gin as a data source behind the scenes
+Using GIN as a data source behind the scenes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Even if you do not want to point collaborators to yet another hosting site but want to be able to expose your datasets via services they use and know already (such as GitHub or GitLab), Gin can be very useful:
-You can let Gin perform data hosting in the background by using it as an "autoenabled data source" that a dataset :term:`sibling` (even if it is published to GitHub or GitLab) can retrieve data from.
-You will need to have a Gin account and SSH key setup, so please take a look at the first part of this section if you do not yet know how to do this.
+Even if you do not want to point collaborators to yet another hosting site but want to be able to expose your datasets via services they use and know already (such as GitHub or GitLab), GIN can be very useful:
+You can let GIN perform data hosting in the background by using it as an "autoenabled data source" that a dataset :term:`sibling` (even if it is published to GitHub or GitLab) can retrieve data from.
+You will need to have a GIN account and an SSH key set up, so please take a look at the first part of this section if you do not yet know how to do this.
Then, follow these steps:
-- First, create a new repository on Gin (see step by step instructions above).
+- First, create a new repository on GIN (see step by step instructions above).
- In your to-be-published dataset, add this repository as a sibling, this time setting `--url` and `--pushurl` arguments explicitly. Make sure to configure an :term:`SSH` URL as the ``--pushurl`` and an :term:`HTTPS` URL as the ``url``.
Please also note that the :term:`HTTPS` URL written after ``--url`` DOES NOT have the ``.git`` suffix.
Here is the command:
-.. code-block:: bash
+.. code-block:: console
$ datalad siblings add \
-d . \
@@ -271,21 +281,23 @@ Then, follow these steps:
--url https://gin.g-node.org/studyforrest/aggregate-fmri-timeseries \
- Locally, run ``git config --unset-all remote.gin.annex-ignore`` to prevent :term:`git-annex` from ignoring this new dataset.
-- Push your data to the repository on Gin (``datalad push --to gin``). This pushes the actual state of the repository, including content, but also adjusts the :term:`git-annex` configuration.
+- Push your data to the repository on GIN (``datalad push --to gin``). This pushes the actual state of the repository, including content, but also adjusts the :term:`git-annex` configuration.
- Configure this sibling as a "common data source". Use the same name as previously in ``--name`` (to indicate which sibling you are configuring) and give a new, different, name after ``--as-common-datasrc``:
-.. code-block:: bash
+.. code-block:: console
$ datalad siblings configure \
--name gin \
--as-common-datasrc gin-src
-- Push to the repository on Gin again (``datalad push --to gin``) to make the configuration change known to the Gin sibling.
+- Push to the repository on GIN again (``datalad push --to gin``) to make the configuration change known to the GIN sibling.
- Publish your dataset to GitHub/GitLab/..., or update an existing published dataset (``datalad push``)
-Afterwards, :dlcmd:`get` retrieves files from Gin, even if the dataset has been cloned from GitHub.
+Afterwards, :dlcmd:`get` retrieves files from GIN, even if the dataset has been cloned from GitHub.
+.. index::
+ pair: common data source; DataLad concept
.. gitusernote:: Siblings as a common data source
The argument ``as-common-datasrc `` configures a sibling as a common data source -- in technical terms, as an auto-enabled git-annex special remote.
@@ -298,9 +310,9 @@ Afterwards, :dlcmd:`get` retrieves files from Gin, even if the dataset has been
and `client `_ are
useful tools with a variety of features that are worthwhile to check out, as well.
-.. [#f3] Alternatively, you can configure the siblings url with :gitcmd:`config`:
+.. [#f3] Alternatively, you can configure the sibling's URL with :gitcmd:`config`:
- .. code-block:: bash
+ .. code-block:: console
$ git config -f .gitmodules --replace-all submodule.midterm_project.url https://github.com/adswa/midtermproject
diff --git a/docs/basics/101-139-gitlfs.rst b/docs/basics/101-139-gitlfs.rst
index ae8eb5d68..4afdfc505 100644
--- a/docs/basics/101-139-gitlfs.rst
+++ b/docs/basics/101-139-gitlfs.rst
@@ -4,14 +4,14 @@ Walk-through: Git LFS as a special remote on GitHub
---------------------------------------------------
Some repository hosting services provide for-pay support for large files, and can thus be used as special remotes as well.
-GitHub and GitLab for example support `Git Large File Storage `_ (Git LFS) for managing data files using Git.
+GitHub and GitLab, for example, support `Git Large File Storage `_ (Git LFS) for managing data files using Git.
A free GitHub subscription allows up to `1GB of free storage and up to 1GB of bandwidth monthly `_.
As such, it might be sufficient for some use cases, and could be configured
quite easily.
In order to store annexed dataset contents on GitHub, we need first to create a repository on GitHub:
-.. code-block:: bash
+.. code-block:: console
$ datalad create-sibling-github test-github-lfs --access-protocol ssh
.: github(-) [git@github.com:yarikoptic/test-github-lfs.git (git)]
@@ -19,7 +19,7 @@ In order to store annexed dataset contents on GitHub, we need first to create a
and then initialize a :term:`special remote` of type ``git-lfs``, pointing to the same GitHub repository:
-.. code-block:: bash
+.. code-block:: console
$ git annex initremote github-lfs type=git-lfs url=git@github.com:yarikoptic/test-github-lfs autoenable=true encryption=none embedcreds=no
@@ -27,13 +27,13 @@ If you would like to compress data in Git LFS, you need to take a detour via
encryption during :gitannexcmd:`initremote` -- this has compression as a
convenient side effect. Here is an example initialization:
-.. code-block:: bash
+.. code-block:: console
$ git annex initremote --force github-lfs type=git-lfs url=git@github.com:yarikoptic/test-github-lfs autoenable=true encryption=shared
With this single step it becomes possible to transfer contents to GitHub:
-.. code-block:: bash
+.. code-block:: console
$ git annex copy --to=github-lfs file.dat
copy file.dat (to github-lfs...)
@@ -42,7 +42,7 @@ With this single step it becomes possible to transfer contents to GitHub:
and the entire dataset to the same GitHub repository:
-.. code-block:: bash
+.. code-block:: console
$ datalad push --to=github
[INFO ] Publishing to github
@@ -50,14 +50,16 @@ and the entire dataset to the same GitHub repository:
Alternatively, to make publication even easier for you, the dataset provider, you can establish a :term:`publication dependency` such that a :dlcmd:`push` performs the data transfer to ``git-lfs`` automatically:
-.. code-block:: bash
+.. code-block:: console
$ datalad siblings configure -s github --publish-depends github-lfs
- # afterwards, only datalad push is needed to publish dataset contents and history
+ $ # afterwards, only datalad push is needed to publish dataset contents and history
$ datalad push --to github
-Consumers of your dataset should be able to retrieve files right after cloning the dataset without a ``siblings enable`` command (as shown in the section :ref:`dropbox`), because of the ``autoenable=true`` configuration for the special remote.
+Consumers of your dataset should be able to retrieve files right after cloning the dataset without a ``siblings enable`` command, as shown in section :ref:`dropbox`, because of the ``autoenable=true`` configuration for the special remote.
+.. index::
+ pair: drop (LFS); with DataLad
.. importantnote:: No drop from LFS
Unfortunately, it is impossible to :dlcmd:`drop` contents from Git LFS:
diff --git a/docs/basics/101-139-hostingservices.rst b/docs/basics/101-139-hostingservices.rst
index f6ae7acd6..2afd2374d 100644
--- a/docs/basics/101-139-hostingservices.rst
+++ b/docs/basics/101-139-hostingservices.rst
@@ -16,15 +16,15 @@ As outlined in a number of sections before, Git repository hosting sites typical
Depending on whether or not an annex is supported, you can push either only your Git history to the sibling, or the complete dataset including annexed file contents.
You can find out whether a sibling on a remote hosting service carries an annex or not by running the :dlcmd:`siblings` command.
A ``+``, ``-``, or ``?`` sign in parentheses indicates whether the sibling carries an annex, does not carry an annex, or whether this information isn't yet known.
-In the example below you can see that a public GitHub repository ``_ does not carry an annex on ``github`` (the sibling ``origin``), but that the annexed data are served from an additional sibling ``mddatasrc`` (a :term:`special remote` with annex support).
+In the example below, you can see that the public GitHub repository `github.com/psychoinformatics-de/studyforrest-data-phase2 `_ does not carry an annex on GitHub (the sibling ``origin``), but that the annexed data are served from an additional sibling ``mddatasrc`` (a :term:`special remote` with annex support).
Even though the dataset sibling on GitHub does not serve the data, it constitutes a simple, findable access point to retrieve the dataset, and can be used to provide updates and fixes via :term:`pull request`\s, issues, etc.
-.. code-block:: bash
+.. code-block:: console
- # a clone of github/psychoinformatics/studyforrest-data-phase2 has the following siblings:
+ $ # a clone of github/psychoinformatics/studyforrest-data-phase2 has the following siblings:
$ datalad siblings
.: here(+) [git]
- .: mddatasrc(+) [http://psydata.ovgu.de/studyforrest/phase2/.git (git)]
+ .: mddatasrc(+) [https://datapub.fz-juelich.de/studyforrest/studyforrest/phase2/.git (git)]
.: origin(-) [git@github.com:psychoinformatics-de/studyforrest-data-phase2.git (git)]
@@ -34,23 +34,39 @@ How to add a sibling on a Git repository hosting site: The manual way
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-#. Create a new repository via the webinterface of the hosting service of your choice.
- It does not need to have the same name as your local dataset, but it helps to associate local dataset and remote siblings.
+#. Create a new repository via the webinterface of the hosting service of your choice. The screenshots in :numref:`fig-newrepogin` and :numref:`fig-newrepogithub` show examples of this.
+ The new repository does not need to have the same name as your local dataset, but it helps to associate the local dataset with its remote siblings.
+
+#. Afterwards, copy the :term:`SSH` or :term:`HTTPS` URL of the repository. Usually, repository hosting services will provide you with a convenient way to copy it to your clipboard. An SSH URL takes the form ``git@://.git`` and an HTTPS URL takes the form ``https:////.git``. The type of URL you choose determines whether and how you will be able to ``push`` to your repository. Note that many services will require you to use the SSH URL to your repository in order to do :dlcmd:`push` operations, so make sure to take the :term:`SSH` and not the :term:`HTTPS` URL if this is the case.
+
+#. If you pick the :term:`SSH` URL, make sure to have an :term:`SSH key` set up. This usually requires generating an SSH key pair if you do not have one yet, and uploading the public key to the repository hosting service. The :find-out-more:`on SSH keys ` points to a useful tutorial for this.
+
+#. Use the URL to add the repository as a sibling. There are two commands that allow you to do that, and a combined sketch follows after this list; both require that you give the sibling a name of your choice (common name choices are ``upstream``, or a short-cut for your user name or the hosting platform, but it's completely up to you to decide):
+
+ #. ``git remote add ``
+ #. ``datalad siblings add --dataset . --name --url ``
+
+#. Push your dataset to the new sibling: ``datalad push --to ``
+
+
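+A combined sketch of the last two steps (the sibling name ``upstream`` and the URL are hypothetical):
+
+.. code-block:: console
+
+   $ datalad siblings add --dataset . --name upstream \
+     --url git@gin.g-node.org:/me/DataLad-101.git
+   $ datalad push --to upstream
+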
+.. _fig-newrepogin:
.. figure:: ../artwork/src/GIN_newrepo.png
+ :width: 80%
Webinterface of :term:`GIN` during the creation of a new repository.
-.. figure:: ../artwork/src/newrepo-github.png
- Webinterface of :term:`GitHub` during the creation of a new repository.
+.. _fig-newrepogithub:
-#. Afterwards, copy the :term:`SSH` or :term:`HTTPS` URL of the repository. Usually, repository hosting services will provide you with a convenient way to copy it to your clipboard. An SSH URL takes the form ``git@://.git`` and an HTTPS URL takes the form ``https:////.git``. The type of URL you choose determines whether and how you will be able to ``push`` to your repository. Note that many services will require you to use the SSH URL to your repository in order to do :dlcmd:`push` operations, so make sure to take the :term:`SSH` and not the :term:`HTTPS` URL if this is the case.
+.. figure:: ../artwork/src/newrepo-github.png
+ :width: 80%
-#. If you pick the :term:`SSH` URL, make sure to have an :term:`SSH key` set up. This usually requires generating an SSH key pair if you do not have one yet, and uploading the public key to the repository hosting service.
+ Webinterface of :term:`GitHub` during the creation of a new repository.
-.. _sshkey:
+.. index:: concepts; SSH key, SSH; key
+.. _sshkey:
.. find-out-more:: What is an SSH key and how can I create one?
:name: fom-sshkey
@@ -84,15 +100,6 @@ How to add a sibling on a Git repository hosting site: The manual way
to handle this passphrase for you with a single command. How to do all of this
is detailed in the tutorial.
-
-#. Use the URL to add the repository as a sibling. There are two commands that allow you to do that; both require you give the sibling a name of your choice (common name choices are ``upstream``, or a short-cut for your user name or the hosting platform, but its completely up to you to decide):
-
- #. ``git remote add ``
- #. ``datalad siblings add --dataset . --name --url ``
-
-#. Push your dataset to the new sibling: ``datalad push --to ``
-
-
How to add a sibling on a Git repository hosting site: The automated way
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -104,17 +111,17 @@ Each command is slightly tuned towards the peculiarities of each particular plat
- ``[REPONAME]`` (required): The name of the repository on the hosting site. It will be created under a user's namespace, unless this argument includes an organization name prefix. For example, ``datalad create-sibling-github my-awesome-repo`` will create a new repository under ``github.com//my-awesome-repo``, while ``datalad create-sibling-github /my-awesome-repo`` will create a new repository of this name under the GitHub organization ```` (given appropriate permissions).
- ``-s/--name `` (required): A name under which the sibling is identified. By default, it will be based on or similar to the hosting site. For example, the sibling created with ``datalad create-sibling-github`` will be called ``github`` by default.
-- ``--credential `` (optional): Credentials used for authentication are stored internally by DataLad under specific names. These names allow you to have multiple credentials, and flexibly decide which one to use. When ``--credential `` is the name of an existing credential, DataLad tries to authenticate with the specified credential; when it does not yet exist DataLad will prompt interactively for a credential, such as an access token, and store it under the given ```` for future authentications. By default, DataLad will name a credential according to the hosting service URL it used for, for example ``datalad-api.github.com`` as the default for credentials used to authenticate against GitHub.
+- ``--credential `` (optional): Credentials used for authentication are stored internally by DataLad under specific names. These names allow you to have multiple credentials, and flexibly decide which one to use. When ``--credential `` is the name of an existing credential, DataLad tries to authenticate with the specified credential; when it does not yet exist, DataLad will prompt interactively for a credential, such as an access token, and store it under the given ```` for future authentications. By default, DataLad will name a credential according to the hosting service URL it is used for, such as ``datalad-api.github.com`` as the default for credentials used to authenticate against GitHub.
- ``--access-protocol {https|ssh|https-ssh}`` (default ``https``): Whether to use :term:`SSH` or :term:`HTTPS` URLs, or a hybrid version in which HTTPS is used to *pull* and SSH is used to *push*. Using :term:`SSH` URLs requires an :term:`SSH key` setup, but is a very convenient authentication method, especially when pushing updates -- which would need manual input on user name and token with every ``push`` over HTTPS.
- ``--dry-run`` (optional): With this flag set, the command will not actually create the target repository, but only perform tests for name collisions and report repository name(s).
- ``--private`` (optional): A switch that, if set, makes sure that the created repository is private.
-Other streamlined arguments, such as ``--recursive`` or ``--publish-depends`` allow you to perform more complex configurations, for example publication of dataset hierarchies or connections to :term:`special remote`\s. Upcoming walk-throughs will demonstrate them.
+Other streamlined arguments, such as ``--recursive`` or ``--publish-depends``, allow you to perform more complex configurations, such as publication of dataset hierarchies or connections to :term:`special remote`\s. Upcoming walk-throughs will demonstrate them.
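+
+As a sketch (the repository name is hypothetical), several of these arguments can be combined in a single call; with ``--dry-run``, nothing is created yet:
+
+.. code-block:: console
+
+   $ datalad create-sibling-github my-awesome-repo \
+     -s github --access-protocol ssh --private --dry-run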
Self-hosted repository services, e.g., Gogs or Gitea instances, have an additional required argument, the ``--api`` flag.
It needs to point to the URL of the instance, for example
-.. code-block:: bash
+.. code-block:: console
$ datalad create-sibling-gogs my_repo_on_gogs --api "https://try.gogs.io"
@@ -136,6 +143,7 @@ Personal access tokens are an alternative to authenticating via your password, a
If you are prompted for ``username`` and ``password`` in the command line, you would enter your token in place of the ``password`` [#f3]_.
Note that you do not have to type your token at every authentication -- your token will be stored on your system the first time you have used it and automatically reused whenever relevant.
+.. index:: credential; storage
.. find-out-more:: How does the authentication storage work?
Passwords, user names, tokens, or any other login information is stored in
@@ -149,9 +157,12 @@ Which permissions do they need?
"""""""""""""""""""""""""""""""
The most convenient way to generate tokens is typically via the webinterface of the hosting service of your choice.
-Often, you can specifically select which set of permissions a specific token has in a drop-down menu similar (but likely not identical) to this screenshot from GitHub:
+Often, you can specifically select which set of permissions a specific token has in a drop-down menu similar (but likely not identical) to the one in the screenshot from GitHub in :numref:`fig-token`.
+
+.. _fig-token:
.. figure:: ../artwork/src/github-token.png
+ :width: 80%
Webinterface to generate an authentication token on GitHub. One typically has to set a name and
permission set, and potentially an expiration date.
@@ -160,7 +171,7 @@ For creating and updating repositories with DataLad commands it is usually suffi
However, broader permission sets may also make sense.
Should you employ GitHub workflows, for example, a token without "workflow" scope could not push changes to workflow files, resulting in errors like this one:
-.. code-block:: bash
+.. code-block:: console
[remote rejected] (refusing to allow a Personal Access Token to create or update workflow `.github/workflows/benchmarks.yml` without `workflow` scope)]
@@ -180,7 +191,7 @@ This configuration file is typically called ``.python-gitlab.cfg`` and placed in
It contains one section per GitLab instance, and a ``[global]`` section that defines the default instance to use.
Here is an example:
-.. code-block:: bash
+.. code-block:: console
$ cat ~/.python-gitlab.cfg
[global]
@@ -199,9 +210,15 @@ Here is an example:
private_token =
Once this configuration is in place, ``create-sibling-gitlab``'s ``--site`` parameter can be supplied with the name of the instance you want to use (e.g., ``datalad create-sibling-gitlab --site gitlab-general``).
-Ensure that the token for each instance has appropriate permissions to create new groups and projects under your user account using the GitLab API.
+Ensure that the token for each instance has appropriate permissions to create new groups and projects under your user account using the GitLab API, as shown in :numref:`fig-gitlabtoken`.
+
+.. _fig-gitlabtoken:
.. figure:: ../artwork/src/gitlab-token.png
+ :width: 80%
+
+ Webinterface to generate an authentication token on GitLab. One typically has to set a name and
+ permission set, and potentially an expiration date.
Step 2: Create or select a group
""""""""""""""""""""""""""""""""
@@ -210,13 +227,20 @@ GitLab's organization consists of *projects* and *groups*.
Projects are single repositories, and groups can be used to manage one or more projects at the same time.
In order to use ``create-sibling-gitlab``, a user **must** `create a group `_ via the web interface, or specify a pre-existing group, because `GitLab does not allow root-level groups to be created via their API `_.
Only when there already is a "parent" group can DataLad and other tools create sub-groups and projects automatically.
-In the screenshots below, a new group ``my-datalad-root-level-group`` is created right underneath the user account.
+In the screenshots :numref:`fig-rootgroup-gitlab1` and :numref:`fig-rootgroup-gitlab2`, a new group ``my-datalad-root-level-group`` is created right underneath the user account.
The group name as shown in the URL bar is what DataLad needs in order to create sibling datasets.
+.. _fig-rootgroup-gitlab1:
.. figure:: ../artwork/src/gitlab-rootgroup.png
+ :width: 80%
+
+ Webinterface to create a root-level group on GitLab.
+.. _fig-rootgroup-gitlab2:
.. figure:: ../artwork/src/gitlab-rootgroup2.png
+ :width: 80%
+ A created root-level group in GitLab's webinterface.
Step 3: Select a layout
"""""""""""""""""""""""
@@ -247,9 +271,12 @@ Consider the ``DataLad-101`` dataset, a superdataset with a several subdatasets
│ ├── [...]
-The ``collection`` and ``flat`` layouts for this dataset look like this in practice:
+How the ``collection`` and ``flat`` layouts for this dataset look in practice is shown in :numref:`fig-gitlab-layout`.
+
+.. _fig-gitlab-layout:
.. figure:: ../artwork/src/gitlab-layouts.png
+ :width: 50%
The ``collection`` layout has a group (``DataLad-101_collection``, defined by the user with a configuration) with four projects underneath. The ``project`` project contains the root-level dataset, and all contained subdatasets are named according to their location in the dataset. The ``flat`` layout consists of projects in the root-level group. The project name for the superdataset (``DataLad-101_flat``) is defined by the user with a configuration, and the names of the subdatasets extend this project name based on their location in the dataset hierarchy.
@@ -259,9 +286,9 @@ Publishing a single dataset
When publishing a single dataset, users can configure the project or group name as a command argument ``--project``.
Here are two command examples and their outcomes.
-For a **flat** layout, the ``--project`` parameter determines the project name:
+For a **flat** layout, the ``--project`` parameter determines the project name, as shown in :numref:`fig-gitlab-flat`:
-.. code-block:: bash
+.. code-block:: console
$ datalad create-sibling-gitlab --site gitlab-general --layout flat --project my-datalad-root-level-group/this-will-be-the-project-name
create_sibling_gitlab(ok): . (dataset) [sibling repository 'gitlab' created at https://gitlab.com/my-datalad-root-level-group/this-will-be-the-project-name]
@@ -270,11 +297,16 @@ For a **flat** layout, the ``--project`` parameter determines the project name:
configure-sibling (ok: 1)
create_sibling_gitlab (ok: 1)
+.. _fig-gitlab-flat:
+
.. figure:: ../artwork/src/gitlab-layout-flat.png
+ :width: 50%
-For a **collection** layout, the ``--project`` parameter determines the group name:
+ An example dataset using GitLab's "flat" layout.
-.. code-block:: bash
+For a **collection** layout, the ``--project`` parameter determines the group name, as shown in :numref:`fig-gitlab-collection`:
+
+.. code-block:: console
$ datalad create-sibling-gitlab --site gitlab-general --layout collection --project my-datalad-root-level-group/this-will-be-the-group-name
create_sibling_gitlab(ok): . (dataset) [sibling repository 'gitlab' created at https://gitlab.com/my-datalad-root-level-group/this-will-be-the-group-name/project]
@@ -283,28 +315,35 @@ For a **collection** layout, the ``--project`` parameter determines the group na
configure-sibling (ok: 1)
create_sibling_gitlab (ok: 1)
+.. _fig-gitlab-collection:
+
.. figure:: ../artwork/src/gitlab-layout-collection.png
+ :width: 50%
+
+ An example dataset using GitLab's "collection" layout.
Publishing datasets recursively
"""""""""""""""""""""""""""""""
-When publishing a series of datasets recursively, the ``--project`` argument can not be used anymore - otherwise, all datasets in the hierarchy would attempt to create the same group or project over and over again.
+When publishing a series of datasets recursively, the ``--project`` argument cannot be used anymore -- otherwise, all datasets in the hierarchy would attempt to create the same group or project over and over again.
Instead, one configures the root-level dataset, and the names for underlying datasets will be derived from this configuration:
-.. code-block:: bash
+.. index::
+ single: configuration item; datalad.gitlab--project
+.. code-block:: console
- # do the configuration for the top-most dataset
- # either configure with Git
+ $ # do the configuration for the top-most dataset
+ $ # either configure with Git
$ git config --local --replace-all \
datalad.gitlab--project \
'my-datalad-root-level-group/DataLad-101_flat'
- # or configure with DataLad
+ $ # or configure with DataLad
$ datalad configuration set \
datalad.gitlab--project='my-datalad-root-level-group/DataLad-101_flat'
Afterwards, publish dataset hierarchies with the ``--recursive`` flag:
-.. code-block:: bash
+.. code-block:: console
$ datalad create-sibling-gitlab --site gitlab-general --recursive --layout flat
create_sibling_gitlab(ok): . (dataset) [sibling repository 'gitlab' created at https://gitlab.com/my-datalad-root-level-group/DataLad-101_flat]
@@ -338,6 +377,9 @@ Once you have set up your dataset sibling(s), you can push individual datasets w
is protecting. If the private key does not have a passphrase, simply copying
this file grants a person access!
-.. [#f3] GitHub `deprecated user-password authentication `_ and only supports authentication via personal access token from November 13th 2020 onwards. Supplying a password instead of a token will fail to authenticate.
+.. [#f3] GitHub `deprecated user-password authentication `_ in favor of authentication via personal access token. Supplying a password instead of a token will fail to authenticate.
+.. index::
+ single: configuration item; datalad.gitlab-default-projectname
+ single: configuration item; datalad.gitlab-default-pathseparator
.. [#f4] The default project name ``project`` and path separator ``-`` are configurable using the dataset-level configurations ``datalad.gitlab-default-projectname`` and ``datalad.gitlab-default-pathseparator``
diff --git a/docs/basics/101-139-privacy.rst b/docs/basics/101-139-privacy.rst
index 975aaceac..22df5209b 100644
--- a/docs/basics/101-139-privacy.rst
+++ b/docs/basics/101-139-privacy.rst
@@ -23,6 +23,8 @@ However, it is highly important to realize that while annexed file's *contents*
If private information such as a medical patient's non-anonymized ID or other potentially identifying information becomes part of the file name, this information is exposed in the Git history of the dataset.
Keep in mind that this applies even if you renamed the file.
+.. index::
+ pair: remove sensitive information; with Git
.. find-out-more:: Help! I accidentally saved sensitive information to Git!
The only lasting way to remove contents from the dataset history completely is to substantially rewrite the dataset's history via tools such as ``git-filter-repo`` or ``git filter-branch``, two very dangerous and potentially destructive operations.
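   A sketch with ``git-filter-repo`` (the file name is hypothetical, and you should work on a backup clone first, since this rewrites every commit):

   .. code-block:: console

      $ git filter-repo --invert-paths --path subject_1.dat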
@@ -33,7 +35,7 @@ Strategy 2: Restrict access via third party service or file system permissions
When you have a dataset and only authorized actors should be allowed to access it,
it is possible to set access restrictions simply via choice of (third party) storage permissions.
-When it is an access restricted dataset on shared infrastructure, for example a scientific dataset that only researchers who signed a data usage agreement should have access to, it could suffice to create specific `Unix groups `_ with authorized users, and give only those groups the necessary permissions.
+When it is an access restricted dataset on shared infrastructure, for example, a scientific dataset that only researchers who signed a data usage agreement should have access to, it could suffice to create specific `Unix groups `_ with authorized users, and give only those groups the necessary permissions.
Depending on what permissions are set, unauthorized actors would not be able to retrieve file contents, or be able to clone the dataset at all.
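A sketch of this approach, with hypothetical group and dataset names:

.. code-block:: console

   $ sudo groupadd authorized-researchers
   $ sudo chgrp -R authorized-researchers /data/study-dataset
   $ sudo chmod -R o-rwx /data/study-dataset
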
The ability of repository hosting services to make datasets private and only allow select collaborators access is yet another method of keeping complete datasets as private as necessary, even though you should think twice about whether you should host sensitive repositories on these services at all.
@@ -55,11 +57,11 @@ Let's say you have a dataset with three files:
- ``subject_1.dat``
- ``subject_2.dat``
-Consider that all of these files are annexed. While the information in ``experiment.txt`` is fine for everyone to see, ``subject_1.dat`` and ``subject_2.dat`` contain personal and potentially identifying data that can not be shared.
+Consider that all of these files are annexed. While the information in ``experiment.txt`` is fine for everyone to see, ``subject_1.dat`` and ``subject_2.dat`` contain personal and potentially identifying data that cannot be shared.
Nevertheless, you want collaborators to know that these files exist.
By publishing only the file contents of ``experiment.txt`` with
-.. code-block:: bash
+.. code-block:: console
$ datalad push --to github experiment.txt
diff --git a/docs/basics/101-139-s3.rst b/docs/basics/101-139-s3.rst
index de2770220..71ba2fdd1 100644
--- a/docs/basics/101-139-s3.rst
+++ b/docs/basics/101-139-s3.rst
@@ -50,7 +50,7 @@ Key" or access existing credentials. Take note to copy both the "Access Key ID"
To ensure that your access key details are known when initializing the special
remote, export them as :term:`environment variable`\s in your shell:
-.. code-block:: bash
+.. code-block:: console
$ export AWS_ACCESS_KEY_ID="your-access-key-ID"
$ export AWS_SECRET_ACCESS_KEY="your-secret-access-key"
@@ -69,12 +69,12 @@ If you already have a small DataLad dataset to practice with, feel free to use i
during the rest of the walkthrough. If you do not have data, no problem! As a general
introduction, the steps below will download a small public neuroimaging dataset,
and transform it into a DataLad dataset. We'll use the `MoAEpilot `_
-dataset containing anatomical and functional images from a single subject, as well as some metadata.
+dataset containing anatomical and functional images from a single brain, as well as some metadata.
In the first step, we create a new directory called ``neuro-data-s3``, we download and extract the data,
and then we move the extracted contents into our new directory:
-.. code-block:: bash
+.. code-block:: console
$ cd
$ mkdir neuro-data-s3 && \
@@ -111,7 +111,7 @@ and then we move the extracted contents into our new directory:
Now we can view the directory tree to see the dataset content:
-.. code-block:: bash
+.. code-block:: console
$ tree
.
@@ -132,7 +132,7 @@ with ``main`` as the default branch.
We can turn our ``neuro-data-s3`` directory into a DataLad dataset with the
:dlcmd:`create --force` command. After that, we save the dataset with :dlcmd:`save`:
-.. code-block:: bash
+.. code-block:: console
$ datalad create --force --description "neuro data to host on s3"
[INFO ] Creating a new annex repo at /Users/jsheunis/Documents/neuro-data-s3
@@ -162,7 +162,7 @@ is that a :term:`sibling` is added to the DataLad dataset. This can be verified
by running :dlcmd:`siblings` before and after initializing the special
remote. Before, the only "sibling" is the actual DataLad dataset:
-.. code-block:: bash
+.. code-block:: console
$ datalad siblings
.: here(+) [git]
@@ -174,7 +174,7 @@ that adheres to Amazon S3's `bucket naming rules ``
option. For consistency, we'll give the GitHub repository the same name as the dataset name.
-.. code-block:: bash
+.. code-block:: console
$ datalad create-sibling-github -d . neuro-data-s3 \
--publish-depends public-s3
@@ -245,7 +245,7 @@ on GitHub, which required preconfigured GitHub authentication details.
The creation of the sibling (named ``github``) can also be confirmed with :dlcmd:`siblings`:
-.. code-block:: bash
+.. code-block:: console
$ datalad siblings
.: here(+) [git]
@@ -257,7 +257,7 @@ to allow others to access the data. We do this with :dlcmd:`push --to github`.
The ``--to github`` specifies which sibling to push the dataset to, but because of the
publication dependency DataLad will push the annexed contents to the special remote first.
-.. code-block:: bash
+.. code-block:: console
$ datalad push --to github
copy(ok): CHANGES (file) [to public-s3...]
@@ -292,7 +292,7 @@ You have now successfully created a DataLad dataset with an AWS S3 special remot
annexed file content and with a public GitHub sibling from which the dataset can be accessed.
Users can now :dlcmd:`clone` the dataset using the GitHub repository URL:
-.. code-block:: bash
+.. code-block:: console
$ cd /tmp
$ datalad clone https://github.com//neuro-data-s3.git
diff --git a/docs/basics/101-141-push.rst b/docs/basics/101-141-push.rst
index 66ca6440c..13eca05d9 100644
--- a/docs/basics/101-141-push.rst
+++ b/docs/basics/101-141-push.rst
@@ -1,7 +1,9 @@
+.. index::
+ pair: push; DataLad command
.. _push:
-Overview: The datalad push command
-----------------------------------
+The datalad push command
+------------------------
Previous sections on publishing DataLad datasets have each
shown you crucial aspects of the functions of dataset publishing with
@@ -14,6 +16,8 @@ The general overview
It is capable of publishing all dataset content, i.e., files stored in :term:`Git`,
and files stored with :term:`git-annex`, to a known dataset :term:`sibling`.
+.. index::
+ pair: push; DataLad concept
.. gitusernote:: Push internals
The :dlcmd:`push` uses ``git push``, and ``git annex copy`` under
@@ -21,26 +25,30 @@ and files stored with :term:`git-annex`, to a known dataset :term:`sibling`.
or git-annex special remotes (if they support data upload).
In order to publish a dataset, the dataset needs to have a sibling to push to.
-This, for instance, can be a :term:`GitHub`, :term:`GitLab`, or :term:`Gin`
+This, for instance, can be a :term:`GitHub`, :term:`GitLab`, or :term:`GIN`
repository, but it can also be a Remote Indexed Archive (RIA) store for backup
or storage of datasets [#f1]_, or a regular clone.
+.. index::
+ pair: create-sibling-github; DataLad command
+ pair: create-sibling-gitlab; DataLad command
+ pair: create-sibling-ria; DataLad command
+ pair: GitHub; dataset hosting
+ pair: GitLab; dataset hosting
+ pair: RIA; dataset hosting
+ pair: create sibling; with DataLad
.. find-out-more:: all of the ways to configure siblings
- .. index:: ! datalad command; create-sibling-github
- .. index:: ! datalad command; create-sibling-gitlab
- .. index:: ! datalad command; create-sibling-ria
-
- Add an existing repository as a sibling with the :dlcmd:`siblings`
command. Here are common examples:
- .. code-block:: bash
+ .. code-block:: console
- # to a remote repository
+ $ # to a remote repository
$ datalad siblings add --name github-repo --url
- # to a local path
+ $ # to a local path
$ datalad siblings add --name local-sibling --url /path/to/sibling/ds
- # to a clone on an SSH-accessible machine
+ $ # to a clone on an SSH-accessible machine
$ datalad siblings add --name server-sibling --url [user@]hostname:/path/to/sibling/ds
- Create a sibling on an external hosting service from scratch, right from
@@ -59,7 +67,7 @@ In order to publish dataset content, DataLad needs to know to which sibling
content shall be pushed. This can be specified with the ``--to`` option directly
from the command line:
-.. code-block:: bash
+.. code-block:: console
$ datalad push --to
@@ -74,7 +82,7 @@ no push is attempted.
Additionally, :dlcmd:`push` will attempt to automatically decide what type
of dataset contents are going to be published. With a sibling that has a
:term:`special remote` configured as a :term:`publication dependency`,
-or a sibling that contains an annex (such as a Gin repository or a
+or a sibling that contains an annex (such as a GIN repository or a
:term:`Remote Indexed Archive (RIA) store`), both the contents
stored in Git (i.e., a dataset's history) as well as file contents stored in
git-annex will be published unless dataset configurations overrule this.
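For example, this automatic decision can be overruled with ``push``'s ``--data`` option (the sibling name is hypothetical):

.. code-block:: console

   $ datalad push --to gin --data anything   # transfer all annexed content, too
   $ datalad push --to gin --data nothing    # publish only what Git manages
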
@@ -116,6 +124,8 @@ that should be published have sibling names identical to the sibling specified i
the top-level :dlcmd:`push` command, or that appropriate default publication
targets are configured throughout the dataset hierarchy.
+.. index::
+ pair: configure which branches to push; with Git
.. find-out-more:: Pushing more than the current branch
:name: fom-push-branch
:float:
@@ -130,7 +140,7 @@ targets are configured throughout the dataset hierarchy.
:dlcmd:`push` pushes *all* branches to the sibling.
A concrete example: On a dataset level, this can be done using
- .. code-block:: bash
+ .. code-block:: console
$ git config --local push.default matching
@@ -148,7 +158,7 @@ targets are configured throughout the dataset hierarchy.
automatically as well. This configuration was achieved by specifying these branches
(using :term:`globbing` with ``*``) in the ``push`` specification of this :term:`remote`:
- .. code-block:: bash
+ .. code-block:: console
$ git config --local remote.public.push 'refs/heads/sct*'
@@ -172,6 +182,6 @@ For more information on this, and other error messages during push, please check
.. rubric:: Footnotes
-.. [#f1] RIA siblings are filesystem-based, scalable storage solutions for
- DataLad datasets. You can find out more about them in the online version.
+.. [#f1] RIA siblings are file-system-based, scalable storage solutions for
+         DataLad datasets. You can find out more about them in the online handbook.
.. [#f2] For information on the ``numcopies`` and ``wanted`` settings of git-annex see its documentation at `git-annex.branchable.com/git-annex-wanted/ `_ and `git-annex.branchable.com/git-annex-numcopies/ `_.
diff --git a/docs/basics/101-146-gists.rst b/docs/basics/101-146-gists.rst
index e45a6e814..6de39d92e 100644
--- a/docs/basics/101-146-gists.rst
+++ b/docs/basics/101-146-gists.rst
@@ -16,8 +16,9 @@ take a look at the :ref:`cheat`. The
`tips collection of git-annex <https://git-annex.branchable.com/tips/>`_ is also
a very valuable resource.
-.. figure:: ../artwork/src/gists.svg
+.. image:: ../artwork/src/gists.svg
:width: 50%
+ :align: center
.. _parallelize:
@@ -25,9 +26,9 @@ a very valuable resource.
Parallelize subdataset processing
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-DataLad can not yet parallelize processes that are performed
+DataLad cannot yet parallelize processes that are performed
independently over a large number of subdatasets. Pushing across a dataset
-hierarchy for example, is performed one after the other.
+hierarchy, for example, is performed one after the other.
Unix, however, has a few tools such as `xargs `_
or the ``parallel`` tool of `moreutils `_
that can assist.
@@ -35,7 +36,7 @@ that can assist.
Here is an example of pushing all subdatasets (and their respective subdatasets)
recursively to their (identically named) siblings:
-.. code-block:: bash
+.. code-block:: console
$ datalad -f '{path}' subdatasets | xargs -n 1 -P 10 datalad push -r --to <sibling> -d
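
Before unleashing ten parallel pushes, it can be reassuring to preview the list of datasets by running the first half of the pipe on its own -- it prints one subdataset path per line:

.. code-block:: console

   $ datalad -f '{path}' subdatasets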
@@ -46,7 +47,7 @@ parallelization.
Here is an example of cross-dataset download parallelization:
-.. code-block:: bash
+.. code-block:: console
$ datalad -f '{path}' subdatasets | xargs -n 1 -P 10 datalad get -d
@@ -59,21 +60,21 @@ Check whether all file content is present locally
In order to check if all the files in a dataset have their file contents locally
available, you can ask git-annex:
-.. code-block:: bash
+.. code-block:: console
$ git annex find --not --in=here
Any file that does not have its contents locally available will be listed.
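
Since the command prints one line per file, piping it into ``wc -l`` gives a quick count of files without local content:

.. code-block:: console

   $ git annex find --not --in=here | wc -l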
If there are subdatasets you want to recurse into, use the following command:
-.. code-block:: bash
+.. code-block:: console
$ git submodule foreach --quiet --recursive \
'git annex find --not --in=here --format=$displaypath/$\\{file\\}\\n'
Alternatively, to get very comprehensive output, you can use
-.. code-block:: bash
+.. code-block:: console
$ datalad -f json status --recursive --annex availability
@@ -81,7 +82,7 @@ The output will be returned as json, and the key ``has_content`` indicates local
content availability (``true`` or ``false``). To filter through it, the command
line tool `jq `_ works well:
-.. code-block:: bash
+.. code-block:: console
$ datalad -f json status --recursive --annex all | jq '. | select(.has_content == true).path'
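
The inverse query works just as well, and with jq's ``-r`` flag (raw string output) the resulting paths could even be piped back into :dlcmd:`get` -- a sketch, not a tested pipeline:

.. code-block:: console

   $ datalad -f json status --recursive --annex all | jq -r '. | select(.has_content == false).path' | xargs -n 100 datalad get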
@@ -97,7 +98,7 @@ you can find out about it and remove this file content out of your dataset
Find out which file contents are unused (not referenced by any current branch):
-.. code-block:: bash
+.. code-block:: console
$ git annex unused
unused . (checking for unused data...)
@@ -111,20 +112,20 @@ Find out which file contents are unused (not referenced by any current branch):
Remove a single unused file by specifying its number in the listing above:
-.. code-block:: bash
+.. code-block:: console
$ git annex dropunused 1
dropunused 1 ok
Or a range of unused data with
-.. code-block:: bash
+.. code-block:: console
$ git annex dropunused 1-1000
Or all
-.. code-block:: bash
+.. code-block:: console
$ git annex dropunused all
@@ -154,39 +155,40 @@ they are created with the ``--no-annex`` flag, for example), or AnnexRepos
is is stored in the dataset report of :dlcmd:`wtf` under the key ``repo``.
Here is a one-liner to get this info:
-.. code-block:: bash
+.. code-block:: console
$ datalad -f'{infos[dataset][repo]}' wtf
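
If you prefer a full report over a one-liner, the same information should also appear in the ``dataset`` section of :dlcmd:`wtf`:

.. code-block:: console

   $ datalad wtf -S dataset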
+.. index::
+ pair: create-sibling; DataLad command
+
Backing up datasets
^^^^^^^^^^^^^^^^^^^
-.. index:: ! datalad command; create-sibling
-
In order to back up datasets, you can publish them to a
:term:`Remote Indexed Archive (RIA) store` or to a sibling dataset. The former
solution does not require Git, git-annex, or DataLad to be installed on the
machine that the backup is pushed to; the latter does require them.
-To find out more about RIA stores, checkout the online version of the handbook.
+To find out more about RIA stores, check out the online handbook.
A sketch of how to implement a sibling for backups is below:
-.. code-block:: bash
+.. code-block:: console
- # create a back up sibling
- datalad create-sibling --annex-wanted anything -r myserver:/path/to/backup
- # publish a full backup of the current branch
- datalad publish --to=myserver -r
- # subsequently, publish updates to be backed up with
- datalad publish --to=myserver -r --since= --missing=inherit
+ $ # create a backup sibling
+ $ datalad create-sibling --annex-wanted anything -r myserver:/path/to/backup
+ $ # publish a full backup of the current branch
+ $ datalad publish --to=myserver -r
+ $ # subsequently, publish updates to be backed up with
+ $ datalad publish --to=myserver -r --since= --missing=inherit
In order to push not only the current branch, but all refs, add the option
``--publish-by-default "refs/*"`` to the :dlcmd:`create-sibling` call.
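
Put together with the flags used above, the creation of such a backup sibling could look like this sketch:

.. code-block:: console

   $ datalad create-sibling --annex-wanted anything \
       --publish-by-default 'refs/*' -r myserver:/path/to/backup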
Should you want to back up all annexed data, even past versions of files, use
:gitannexcmd:`sync` to push to the sibling:
-.. code-block:: bash
+.. code-block:: console
$ git annex sync --all --content
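
git-annex also accepts a remote name as an argument, so the synchronization can be limited to the backup sibling (``myserver`` from the example above):

.. code-block:: console

   $ git annex sync myserver --all --content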
@@ -209,7 +211,7 @@ If all subject-subdatasets are installed (e.g., with ``datalad get -n -r`` for
a recursive installation without file retrieval), :term:`globbing` with the
shell works fine:
-.. code-block:: bash
+.. code-block:: console
$ datalad get HCP1200/*/T1w/ribbon.nii.gz
@@ -219,7 +221,7 @@ shell can't expand non-existent paths. As an alternative, you can pipe the outpu
of an (arbitrarily complex) :dlcmd:`search` command into
:dlcmd:`get`:
-.. code-block:: bash
+.. code-block:: console
$ datalad -f '{path}' -c datalad.search.index-egrep-documenttype=all search 'path:.*T1w.*\.nii.gz' | xargs -n 100 datalad get
@@ -227,11 +229,11 @@ However, if you know the file locations within the dataset hierarchy and they
are predictably named and consistent, you can create a file containing all paths to
be retrieved and pipe that into :dlcmd:`get` as well:
-.. code-block:: bash
+.. code-block:: console
- # create file with all file paths
+ $ # create file with all file paths
$ for sub in HCP1200/*; do echo ${sub}/T1w/ribbon.nii.gz; done > toget.txt
- # pipe it into datalad get
+ $ # pipe it into datalad get
$ cat toget.txt | xargs -n 100 datalad get
.. _speedystatus:
@@ -263,7 +265,7 @@ There are, however, ways to shrink the commit history in the annex branch.
In order to :term:`squash` the entire git-annex history into a single commit, run
-.. code-block:: bash
+.. code-block:: console
$ git annex forget --drop-dead --force
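
The git-annex documentation notes that this forgetfulness spreads to other repositories upon synchronization, so a subsequent sync would propagate the trimmed history -- a sketch:

.. code-block:: console

   $ git annex sync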
diff --git a/docs/basics/101-180-FAQ.rst b/docs/basics/101-180-FAQ.rst
index 5ce2a9a17..8828161b8 100644
--- a/docs/basics/101-180-FAQ.rst
+++ b/docs/basics/101-180-FAQ.rst
@@ -1,6 +1,6 @@
.. _FAQ:
-Frequently Asked Questions
+Frequently asked questions
--------------------------
This section answers frequently asked questions about high-level DataLad
@@ -82,7 +82,7 @@ and functions:
Whereas git and git-annex would require the caller to first cd to the target
repository, DataLad figures out which repository the given paths belong to and
then works within that repository.
- :dlcmd:`save . --recursive` will solve the subdataset problem above
+ :dlcmd:`save . --recursive` will solve the subdataset problem above,
for example, no matter what was changed/added, no matter where in a tree
of subdatasets.
- DataLad provides users with the ability to act on "virtual" file paths. If
@@ -144,7 +144,7 @@ How can I convert/import/transform an existing Git or git-annex repository into
You can transform any existing Git or git-annex repository of yours into a
DataLad dataset by running:
-.. code-block:: bash
+.. code-block:: console
$ datalad create -f
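
Afterwards, the repository behaves like any other dataset; as a quick sanity check, :dlcmd:`status` should now run in it without complaint:

.. code-block:: console

   $ datalad status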
@@ -305,7 +305,7 @@ If you do not want to invent a description yourself, you can run
After cloning a dataset, you can retrieve file contents by running
```
- datalad get `
+ datalad get
```
This command will trigger a download of the files, directories, or
@@ -340,8 +340,8 @@ If you do not want to invent a description yourself, you can run
### Find out what has been done
- DataLad datasets contain their history in the ``git log``.
- By running ``git log`` (or a tool that displays Git history) in the dataset or on
+ DataLad datasets contain their history in the `git log`.
+ By running `git log` (or a tool that displays Git history) in the dataset or on
specific files, you can find out what has been done to the dataset or to individual files
by whom, and when.
@@ -391,7 +391,7 @@ If you do not want to invent a description yourself, you can run
subdatasets, run 'datalad get -n '
Afterwards, you can browse the retrieved metadata to find out about
- subdataset contents, and retrieve individual files with `datalad get`.
+ subdataset contents, and retrieve individual files with 'datalad get'.
If you use 'datalad get ', all contents of the
subdataset will be downloaded at once.
@@ -469,9 +469,9 @@ copied. Therefore you can :
- either unlock the files before copying them out,
- or copy them and then use the command ``chmod`` to be able to edit the file.
-.. code-block:: bash
+.. code-block:: console
- # this will give you 'write' permission on the file
+ $ # this will give you 'write' permission on the file
$ chmod +w filename
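
The first route -- unlocking prior to copying -- could look like this sketch, with ``filename`` and the target directory as placeholders:

.. code-block:: console

   $ datalad unlock filename
   $ cp filename /some/destination/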
If you are not familiar with how ``chmod`` works (or if you forgot - let's be honest we
@@ -548,7 +548,7 @@ Here is an example:
.. figure:: ../artwork/src/defaultgitannex_light.png
-This is related to GitHub's decision to make ``main`` `the default branch for newly created repositories `_ -- datasets that do not have a ``main`` branch (but for example a ``master`` branch) may end up with a different branch being displayed on GitHub than intended.
+This is related to GitHub's decision to make ``main`` `the default branch for newly created repositories `_ -- datasets that do not have a ``main`` branch (but, for example, a ``master`` branch) may end up with a different branch being displayed on GitHub than intended.
To fix this for present and/or future datasets, the default branch can be configured to a branch name of your choice on a repository- or organizational level `via GitHub's web-interface `_.
Alternatively, you can rename existing ``master`` branches into ``main`` using ``git branch -m master main`` (but beware of unforeseen consequences - your collaborators may try to ``update`` the ``master`` branch but fail, continuous integration workflows could still try to use ``master``, etc.).
diff --git a/docs/basics/_examples/DL-101-101-101 b/docs/basics/_examples/DL-101-101-101
index 4ef899e36..51fd58921 100644
--- a/docs/basics/_examples/DL-101-101-101
+++ b/docs/basics/_examples/DL-101-101-101
@@ -4,6 +4,3 @@ $ datalad create -c text2git DataLad-101
[INFO] == Command exit (modification check follows) =====
run(ok): /home/me/dl-101/DataLad-101 (dataset) [VIRTUALENV/bin/python /home/a...]
create(ok): /home/me/dl-101/DataLad-101 (dataset)
-action summary:
- create (ok: 1)
- run (ok: 1)
diff --git a/docs/basics/_examples/DL-101-102-103 b/docs/basics/_examples/DL-101-102-103
index 252231393..b11473373 100644
--- a/docs/basics/_examples/DL-101-102-103
+++ b/docs/basics/_examples/DL-101-102-103
@@ -3,5 +3,5 @@ $ wget -q https://sourceforge.net/projects/linuxcommand/files/TLCL/19.01/TLCL-19
-O TLCL.pdf
$ wget -q https://homepages.uc.edu/~becktl/byte_of_python.pdf \
-O byte-of-python.pdf
-# get back into the root of the dataset
+$ # get back into the root of the dataset
$ cd ../
diff --git a/docs/basics/_examples/DL-101-102-106 b/docs/basics/_examples/DL-101-102-106
index 32a211f31..e691719fb 100644
--- a/docs/basics/_examples/DL-101-102-106
+++ b/docs/basics/_examples/DL-101-102-106
@@ -2,6 +2,3 @@ $ datalad save -m "add books on Python and Unix to read later"
add(ok): books/TLCL.pdf (file)
add(ok): books/byte-of-python.pdf (file)
save(ok): . (dataset)
-action summary:
- add (ok: 2)
- save (ok: 1)
diff --git a/docs/basics/_examples/DL-101-102-107 b/docs/basics/_examples/DL-101-102-107
index 074dbc1f4..390bf564c 100644
--- a/docs/basics/_examples/DL-101-102-107
+++ b/docs/basics/_examples/DL-101-102-107
@@ -1,5 +1,5 @@
$ git log -p -n 1
-commit d2bb1a86✂SHA1
+commit b40316a6✂SHA1
Author: Elena Piscopia
Date: Tue Jun 18 16:13:00 2019 +0000
@@ -15,9 +15,9 @@ index 0000000..4c84b61
\ No newline at end of file
diff --git a/books/byte-of-python.pdf b/books/byte-of-python.pdf
new file mode 120000
-index 0000000..b9fcbff
+index 0000000..7a6e51e
--- /dev/null
+++ b/books/byte-of-python.pdf
@@ -0,0 +1 @@
-+../.git/annex/objects/P5/qK/✂/MD5E-s2693891--e61afe4b✂MD5.pdf
++../.git/annex/objects/xF/42/✂/MD5E-s4161086--c832fc13✂MD5.pdf
\ No newline at end of file
diff --git a/docs/basics/_examples/DL-101-102-110 b/docs/basics/_examples/DL-101-102-110
index f569ce228..584f52a4a 100644
--- a/docs/basics/_examples/DL-101-102-110
+++ b/docs/basics/_examples/DL-101-102-110
@@ -1,6 +1,3 @@
$ datalad save -m "add reference book about git" books/progit.pdf
add(ok): books/progit.pdf (file)
save(ok): . (dataset)
-action summary:
- add (ok: 1)
- save (ok: 1)
diff --git a/docs/basics/_examples/DL-101-102-111 b/docs/basics/_examples/DL-101-102-111
index 2e7b143ff..acec55c72 100644
--- a/docs/basics/_examples/DL-101-102-111
+++ b/docs/basics/_examples/DL-101-102-111
@@ -1,6 +1,6 @@
-# lets make the output a bit more concise with the --oneline option
+$ # let's make the output a bit more concise with the --oneline option
$ git log --oneline
-8e5dc80 add reference book about git
-d2bb1a8 add books on Python and Unix to read later
+a875e49 add reference book about git
+b40316a add books on Python and Unix to read later
e0ff3a7 Instruct annex to add text files to Git
4ce681d [DATALAD] new dataset
diff --git a/docs/basics/_examples/DL-101-102-112 b/docs/basics/_examples/DL-101-102-112
index 6796f302e..2014e5e9a 100644
--- a/docs/basics/_examples/DL-101-102-112
+++ b/docs/basics/_examples/DL-101-102-112
@@ -3,11 +3,6 @@ $ datalad download-url \
--dataset . \
-m "add beginners guide on bash" \
-O books/bash_guide.pdf
-[INFO] Downloading 'https://www.tldp.org/LDP/Bash-Beginners-Guide/Bash-Beginners-Guide.pdf' into '/home/me/dl-101/DataLad-101/books/bash_guide.pdf'
download_url(ok): /home/me/dl-101/DataLad-101/books/bash_guide.pdf (file)
add(ok): books/bash_guide.pdf (file)
save(ok): . (dataset)
-action summary:
- add (ok: 1)
- download_url (ok: 1)
- save (ok: 1)
diff --git a/docs/basics/_examples/DL-101-102-115 b/docs/basics/_examples/DL-101-102-115
index d4586eaef..675df6c33 100644
--- a/docs/basics/_examples/DL-101-102-115
+++ b/docs/basics/_examples/DL-101-102-115
@@ -1,5 +1,5 @@
$ git log -p -n 1
-commit bcb0ffe8✂SHA1
+commit 59ac8d32✂SHA1
Author: Elena Piscopia