Commit

Use more inclusive terms in code and documentation.
PiperOrigin-RevId: 354289370
lamblin committed Jan 28, 2021
1 parent be22854 commit ec57aeb
Showing 12 changed files with 97 additions and 92 deletions.
2 changes: 1 addition & 1 deletion Intro_to_Metadataset.ipynb
@@ -147,7 +147,7 @@
},
"source": [
"# Primers\n",
"1. Download your data and process it as explained in [link](https://github.com/google-research/meta-dataset/blob/master/README.md#downloading-and-converting-datasets). Set `BASE_PATH` pointing the processed tf-records (`$RECORDS` in the conversion instructions).\n",
"1. Download your data and process it as explained in [link](https://github.com/google-research/meta-dataset/blob/main/README.md#downloading-and-converting-datasets). Set `BASE_PATH` pointing the processed tf-records (`$RECORDS` in the conversion instructions).\n",
"2. `meta_dataset` supports many different setting for sampling data. We use [gin-config](https://github.com/google/gin-config) to control default parameters of our functions. You can go to default gin file we are pointing and see the default values.\n",
"3. You can use `meta_dataset` in **eager** or **graph** mode.\n",
"4. Let's write a generator that makes the right calls to return data from dataset. `dataset.make_one_shot_iterator()` returns an iterator where each element is an episode.\n",
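For concreteness, a minimal sketch of the kind of generator step 4 describes, assuming TF1 graph mode; `episode_dataset` is a stand-in name for the dataset produced by the meta_dataset input pipeline, not an API of the library:

```python
import tensorflow.compat.v1 as tf


def iterate_episodes(episode_dataset):
  """Yields episodes from `episode_dataset` as NumPy structures.

  `episode_dataset` is assumed to be a tf.data.Dataset whose elements are
  episodes, e.g. the output of the meta_dataset input pipeline.
  """
  iterator = episode_dataset.make_one_shot_iterator()
  next_episode = iterator.get_next()
  with tf.Session() as sess:
    while True:
      try:
        yield sess.run(next_episode)
      except tf.errors.OutOfRangeError:
        return
```

In eager mode the same dataset can instead be iterated directly with a Python `for` loop.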
6 changes: 3 additions & 3 deletions README.md
@@ -15,7 +15,7 @@ See below for [user instructions](#user-instructions), including how to:
3. [train](#training) implemented models.

See this
[introduction notebook](https://github.com/google-research/meta-dataset/blob/master/Intro_to_Metadataset.ipynb)
[introduction notebook](https://github.com/google-research/meta-dataset/blob/main/Intro_to_Metadataset.ipynb)
for a demonstration of how to sample data from the pipeline (episodes or
batches).

@@ -69,7 +69,7 @@ SOTA on Meta-Dataset (train-on-ILSVRC) as of NeurIPS 2020.
# Leaderboard (in progress)

The tables below were generated by
[this notebook](https://github.com/google-research/meta-dataset/blob/master/Leaderboard.ipynb).
[this notebook](https://github.com/google-research/meta-dataset/blob/main/Leaderboard.ipynb).

## Adding a new model to the leaderboard

@@ -189,7 +189,7 @@ variables are defined:
about which classes are part of the meta-training, meta-validation, and
meta-test set. These files are only used during the dataset conversion
phase, but can help troubleshooting later. To re-use the
[canonical splits](https://github.com/google-research/meta-dataset/tree/master/meta_dataset/dataset_conversion/splits)
[canonical splits](https://github.com/google-research/meta-dataset/tree/main/meta_dataset/dataset_conversion/splits)
instead of re-generating them, you can make it point to
`meta_dataset/dataset_conversion` in your checkout.
- `$RECORDS`: root directory that will contain the converted datasets (one per
2 changes: 1 addition & 1 deletion doc/dataset_conversion.md
@@ -242,7 +242,7 @@ more context.
## Notes
1. A [reference version](
https://github.com/google-research/meta-dataset/tree/master/meta_dataset/dataset_conversion/dataset_specs)
https://github.com/google-research/meta-dataset/tree/main/meta_dataset/dataset_conversion/dataset_specs)
of each of the `dataset_spec.json` files is part of this repository. You can
compare them with the version generated by the conversion process for
troubleshooting.
2 changes: 1 addition & 1 deletion meta_dataset/data/dataset_spec.py
@@ -517,7 +517,7 @@ class id's relative to the split (between 0 and num classes in split).
id's relative to the dataset (between 0 and the total num classes).
"""
# The number of classes before the start of superclass_id, i.e. the class id
# of the first class of the given superclass.
# of the first (sub-)class of the given superclass.
superclass_offset = self._count_classes_in_superclasses(
range(superclass_id))
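A small worked sketch of the offset computed above, with invented per-superclass class counts (the real code delegates the counting to `_count_classes_in_superclasses`):

```python
# Invented counts: superclass 0 has 3 classes, superclass 1 has 5.
classes_per_superclass = [3, 5, 4]
superclass_id = 2
superclass_offset = sum(classes_per_superclass[:superclass_id])  # 3 + 5 = 8
# The first class of superclass 2 therefore has dataset-relative class ID 8.
```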

29 changes: 15 additions & 14 deletions meta_dataset/data/pipeline.py
@@ -46,16 +46,16 @@
'The strength of color jittering for SimCLR episodes.')


def filter_dummy_examples(example_strings, class_ids):
"""Returns tensors with only actual examples, filtering out the dummy ones.
def filter_placeholders(example_strings, class_ids):
"""Returns tensors with only actual examples, filtering out the placeholders.
Actual examples are the first ones in the tensors, and followed by dummy ones,
indicated by negative class IDs.
Actual examples are the first ones in the tensors, and followed by placeholder
ones, indicated by negative class IDs.
Args:
example_strings: 1-D Tensor of dtype str, Example protocol buffers.
class_ids: 1-D Tensor of dtype int, class IDs (absolute wrt the original
dataset, except for negative ones, that indicate dummy examples).
dataset, except for negative ones, that indicate placeholder examples).
"""
num_actual = tf.reduce_sum(tf.cast(class_ids >= 0, tf.int32))
actual_example_strings = example_strings[:num_actual]
@@ -84,7 +84,7 @@ def flush_and_chunk_episode(example_strings, class_ids, chunk_sizes):
1) splits the batch of examples into a "flush" chunk and some number of
additional chunks (as determined by `chunk_sizes`),
2) throws away the "flush" chunk, and
3) removes the padded dummy examples from the additional chunks.
3) removes the padded placeholder examples from the additional chunks.
For example, in the context of few-shot learning, where episodes are composed
of a support set and a query set, `chunk_size = (150, 100, 50)` would be
@@ -107,7 +107,7 @@ def flush_and_chunk_episode(example_strings, class_ids, chunk_sizes):
class_ids_chunks = tf.split(class_ids, num_or_size_splits=chunk_sizes)[1:]

return tuple(
filter_dummy_examples(strings, ids)
filter_placeholders(strings, ids)
for strings, ids in zip(example_strings_chunks, class_ids_chunks))
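As a toy illustration of the flush-and-chunk mechanism described in the docstring above (a NumPy sketch with made-up chunk sizes and class IDs, not the library code):

```python
import numpy as np

# Miniature version of chunk_sizes = (150, 100, 50): flush, support, query.
chunk_sizes = (4, 3, 2)
class_ids = np.array([5, 5, 7, -1,   # flush chunk (discarded entirely)
                      5, 7, -1,      # support chunk; trailing -1 is a placeholder
                      7, -1])        # query chunk; trailing -1 is a placeholder

# Split into chunks and throw away the flush chunk.
_, support_ids, query_ids = np.split(class_ids, np.cumsum(chunk_sizes)[:-1])

# Placeholder examples carry negative class IDs and sit at the end of a chunk.
support_ids = support_ids[support_ids >= 0]  # array([5, 7])
query_ids = query_ids[query_ids >= 0]        # array([7])
```

The pipeline itself performs the same filtering with `tf.split` and `filter_placeholders`, as shown in the surrounding diff.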


@@ -254,7 +254,8 @@ def process_episode(example_strings, class_ids, chunk_sizes, image_size,
1) splits the batch of examples into "flush", "support", and "query" chunks,
2) throws away the "flush" chunk,
3) removes the padded dummy examples from the "support" and "query" chunks,
3) removes the padded placeholder examples from the "support" and "query"
chunks,
4) extracts and processes images out of the example strings, and
5) builds support and query targets (numbers from 0 to K-1 where K is the
number of classes in the episode) from the class IDs.
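One way to picture step 5 (a sketch with invented IDs; the exact helper used by the library is not shown in this diff):

```python
import numpy as np

# Absolute class IDs of the query examples in an episode (made-up values).
query_class_ids = np.array([17, 42, 17, 99])

# Relabel them as 0..K-1, where K is the number of classes in the episode.
episode_classes = np.unique(query_class_ids)                       # [17, 42, 99]
query_targets = np.searchsorted(episode_classes, query_class_ids)  # [0, 1, 0, 2]
```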
@@ -430,9 +431,9 @@ def make_one_source_episode_pipeline(dataset_spec,
ignore_hierarchy_probability=ignore_hierarchy_probability)
dataset = episode_reader.create_dataset_input_pipeline(sampler, pool=pool)
# Episodes coming out of `dataset` contain flushed examples and are internally
# padded with dummy examples. `process_episode` discards flushed examples,
# splits the episode into support and query sets, removes the dummy examples
# and decodes the example strings.
# padded with placeholder examples. `process_episode` discards flushed
# examples, splits the episode into support and query sets, removes the
# placeholder examples and decodes the example strings.
chunk_sizes = sampler.compute_chunk_sizes()
map_fn = functools.partial(
process_episode,
@@ -531,9 +532,9 @@ def make_multisource_episode_pipeline(dataset_spec_list,
dataset = tf.data.experimental.sample_from_datasets(sources)

# Episodes coming out of `dataset` contain flushed examples and are internally
# padded with dummy examples. `process_episode` discards flushed examples,
# splits the episode into support and query sets, removes the dummy examples
# and decodes the example strings.
# padded with placeholder examples. `process_episode` discards
# flushed examples, splits the episode into support and query sets, removes
# the placeholder examples and decodes the example strings.
chunk_sizes = sampler.compute_chunk_sizes()

def map_fn(episode, source_id):
8 changes: 4 additions & 4 deletions meta_dataset/data/providers.py
@@ -100,12 +100,12 @@ def unique_labels(self):

@property
def labels(self):
"""Return query labels to provide an episodic/batch-agnostic API."""
"""Return query labels to provide an episode/batch-independent API."""
return self.query_labels

@property
def onehot_labels(self):
"""Return one-hot query labels to provide an episodic/batch-agnostic API."""
"""Return one-hot query labels to provide an episode/batch-independent API."""
return self.onehot_query_labels

@property
@@ -166,12 +166,12 @@ def query_shots(self):

@property
def labels(self):
"""Return local query labels to provide an episodic/batch-agnostic API."""
"""Return local query labels to provide an episode/batch-independent API."""
return self.query_labels

@property
def onehot_labels(self):
"""Return local one-hot query labels for episodic/batch-agnostic API."""
"""Return local one-hot query labels for episode/batch-independent API."""
return self.onehot_query_labels

@property
51 changes: 26 additions & 25 deletions meta_dataset/data/reader.py
@@ -19,7 +19,7 @@
The data output by the Reader consists in episodes or batches (for EpisodeReader
and BatchReader respectively) from one source (one split of a dataset). They
contain strings represented images that have not been decoded yet, and can
contain dummy examples and examples to discard.
contain placeholder examples and examples to discard.
See data/pipeline.py for the next stage of the pipeline.
"""
# TODO(lamblinp): Update variable names to be more consistent
@@ -38,22 +38,23 @@
from six.moves import range
import tensorflow.compat.v1 as tf

# DUMMY_CLASS_ID will be used as the target of examples used for padding only.
DUMMY_CLASS_ID = -1
# PLACEHOLDER_CLASS_ID will be used as the target of placeholder examples, that
# are used for padding only.
PLACEHOLDER_CLASS_ID = -1


def _pad(dataset_indices, chunk_size, dummy_dataset_id):
"""Pads `dataset_indices` with dummy values so it has length `chunk_size`.
def _pad(dataset_indices, chunk_size, placeholder_dataset_id):
"""Pads `dataset_indices` with placeholders so it has length `chunk_size`.
Args:
dataset_indices: list of (dataset_id, num_repeats) tuples representing a
sequence of dataset IDs.
chunk_size: int, size to pad to.
dummy_dataset_id: int, dummy value to pad with.
placeholder_dataset_id: int, placeholder value to pad with.
"""
pad_size = chunk_size - sum(n for i, n in dataset_indices)
assert pad_size >= 0
dataset_indices.append([dummy_dataset_id, pad_size])
dataset_indices.append([placeholder_dataset_id, pad_size])
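A usage sketch of `_pad` as defined above (the concrete numbers are invented for illustration):

```python
# `dataset_indices` compactly encodes the dataset-ID stream [3, 3, 0]:
# dataset 3 repeated twice, then dataset 0 once.
dataset_indices = [[3, 2], [0, 1]]

# Pad the encoded stream up to length 5 using placeholder dataset ID 7
# (in the real pipeline this ID equals `num_classes`).
_pad(dataset_indices, chunk_size=5, placeholder_dataset_id=7)

# dataset_indices is now [[3, 2], [0, 1], [7, 2]], i.e. the stream
# [3, 3, 0, 7, 7], whose total length matches `chunk_size`.
```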


def episode_representation_generator(dataset_spec, split, pool, sampler):
@@ -69,11 +69,11 @@ def episode_representation_generator(dataset_spec, split, pool, sampler):
To make sure the input pipeline knows where the episode boundary is within the
stream (and where the boundary is between chunks in an episode), we enforce
that each chunk has a fixed size by padding with dummy dataset IDs (of value
`num_classes`) as needed (in some cases it's possible that no padding is ever
needed). The size of each chunk is prescribed by the `compute_chunk_sizes`
method of `sampler`, which also implicitly defines the number of additional
chunks (i.e. `len(chunk_sizes) - 1`).
that each chunk has a fixed size by padding with placeholder dataset IDs (of
value `num_classes`) as needed (in some cases it's possible that no padding is
ever needed). The size of each chunk is prescribed by the
`compute_chunk_sizes` method of `sampler`, which also implicitly defines the
number of additional chunks (i.e. `len(chunk_sizes) - 1`).
Instead of explicitly representing all elements of the dataset ID stream, this
generator returns a compact representation where repeated elements are
@@ -83,9 +84,9 @@ def episode_representation_generator(dataset_spec, split, pool, sampler):
`tf.data.experimental.choose_from_datasets` and assumes that the list of
tf.data.Dataset objects corresponding to each class in the dataset (there are
`num_classes` of them, which is determined by inspecting the `dataset_spec`
argument using the `split` argument) is appended with a "dummy" Dataset (which
has index `num_classes` in the list) which outputs a constant `(b'',
DUMMY_CLASS_ID)` tuple).
argument using the `split` argument) is appended with a placeholder Dataset
(which has index `num_classes` in the list) which outputs a constant `(b'',
PLACEHOLDER_CLASS_ID)` tuple).
Note that a dataset ID is different from the (absolute) class ID: the dataset
ID refers to the index of the Dataset in the list of Dataset objects, and the
@@ -111,7 +112,7 @@ class ID (or label) refers to the second element of the tuple that the Dataset

class_set = dataset_spec.get_classes(split)
num_classes = len(class_set)
dummy_dataset_id = num_classes
placeholder_dataset_id = num_classes

total_images_per_class = dict(
(class_idx,
@@ -155,12 +156,12 @@ class ID (or label) refers to the second element of the tuple that the Dataset
cursors[class_idx] += total_requested

# An episode sequence is generated in multiple phases, each padded with an
# agreed-upon number of dummy dataset IDs.
# agreed-upon number of placeholder dataset IDs.

_pad(flushed_dataset_indices, flush_chunk_size, dummy_dataset_id)
_pad(flushed_dataset_indices, flush_chunk_size, placeholder_dataset_id)
for dataset_indices, chunk_size in zip(selected_dataset_indices,
other_chunk_sizes):
_pad(dataset_indices, chunk_size, dummy_dataset_id)
_pad(dataset_indices, chunk_size, placeholder_dataset_id)

episode_representation = np.array(
list(
@@ -333,7 +334,7 @@ def create_dataset_input_pipeline(self,
Returns:
dataset: a tf.data.Dataset instance which encapsulates episode creation
for the data identified by `dataset_spec` and `split`. These episodes
contain flushed examples and are internally padded with dummy examples.
contain flushed examples and are internally padded with placeholders.
A later part of the pipeline, shared across all sources, will extract
support and query sets and decode the example strings.
"""
@@ -342,12 +343,12 @@ def create_dataset_input_pipeline(self,
class_datasets = self.construct_class_datasets(
pool=pool, shuffle=shuffle, shuffle_seed=shuffle_seed)

# We also construct a dummy dataset which outputs `(b'', DUMMY_CLASS_ID)`
# tuples.
dummy_dataset = tf.data.Dataset.zip(
# We also construct a placeholder dataset which outputs
# `(b'', PLACEHOLDER_CLASS_ID)` tuples.
placeholder_dataset = tf.data.Dataset.zip(
(tf.data.Dataset.from_tensors(b'').repeat(),
tf.data.Dataset.from_tensors(DUMMY_CLASS_ID).repeat()))
class_datasets.append(dummy_dataset)
tf.data.Dataset.from_tensors(PLACEHOLDER_CLASS_ID).repeat()))
class_datasets.append(placeholder_dataset)

# The "choice" dataset outputs a stream of dataset IDs which are used to
# select which class dataset to sample from. We turn the stream of dataset
