taehyounpark
diff --git a/‎docs/pages/guide.md
Lines changed: 153 additions & 101 deletions b/‎docs/pages/guide.md
Lines changed: 153 additions & 101 deletions
@@ -43,24 +43,31 @@ Call queryosity::dataflow::load() with an input dataset and its constructor argu
 The loaded dataset can then read out columns, provided their data types and names.
 
 @cpp
-using json = qty::json;
+std::ifstream data_json("data.json");
 
-std::ifstream data("data.json");
-auto ds = df.load(dataset::input<json>(data));
+// load a dataset
+using json = qty::json;
+auto ds = df.load(dataset::input<json>(data_json));
 
+// read a column
 auto x = ds.read(dataset::column<double>("x"));
+
+// shortcut: read multiple columns from a dataset
+auto [w, cat] = ds.read(dataset::column<double>("weight"), dataset::column<std::string>("category"));
 @endcpp
 
 A dataflow can load multiple datasets, as long as all valid partitions reported by queryosity::dataset::source::partition() have the same number of total entries.
 A dataset can report an empty partition, which signals that it relinquishes the control to the other datasets.
 
 @cpp
-using csv = qty::csv;
-
 std::ifstream data_csv("data.csv");
-auto y = df.load(dataset::input<csv>(data_csv)).read(dataset::column<double>("y"));
 
-auto z = x+y;
+// another shortcut: load dataset & read column(s) at once
+using csv = qty::csv;
+auto y = df.read(dataset::input<csv>(data_csv), dataset::column<double>("y"));
+
+// x from json, y from csv
+auto z = x + y; // (see next section)
 @endcpp
 
 @see 
@@ -76,94 +83,116 @@ auto z = x+y;
 
 @section guide-column Computing quantities
 
-Call queryosity::dataflow::define() with the appropriate argument.
+New columns can be computed out of existing ones by calling queryosity::dataflow::define() with the appropriate argument, or operators between the underlying data types.
 
 @cpp
-// -----------------------------------------------------------------------------
-// constant
-// -----------------------------------------------------------------------------
-// their values will not change per-entry
-
+// constants columns do not change per-entry
 auto zero = df.define(column::constant(0));
 auto one = df.define(column::constant(1));
 auto two = df.define(column::constant(2));
 
-// -----------------------------------------------------------------------------
-// simple expression
-// -----------------------------------------------------------------------------
-// binary/unary operators between underlying data types
-
+// binary/unary operators
 auto three = one + two;
-auto v0 = v[zero];
+auto v_0 = v[zero]; 
+// reminder: actions are *lazy*, i.e. no undefined behaviour (yet)
 
-// one += zero; // self-assignment operators are not possible
+// self-assignment operators are not possible
+// one += two; 
+
+// C++ function, functor, lambda, etc. evaluated out of input columns
+// tip: pass large values by const& to prevent copies
+auto s_length = df.define(
+    column::expression([](const std::string &txt) { return txt.length(); }), s);
+@endcpp
 
-// -----------------------------------------------------------------------------
-// custom expression
-// -----------------------------------------------------------------------------
-// (C++ funciton, functor, lambda, etc.) evaluated out of input columns
+A column can also be defined by a custom implementation, which offers:
 
-// pass large values by const reference to prevent expensive copies
-auto s_length = df.define(column::expression([](const std::string& txt){return txt.length();}), s);
+- Customization: user-defined constructor arguments and member variables/functions.
+- Optimization: each input column is provided as a column::observable<T>, which defers its computation for the entry until column::observable<T>::value() is invoked.
 
-// -----------------------------------------------------------------------------
-// custom definition
-// -----------------------------------------------------------------------------
-// the most general & performant way to compute a column
+As an example, consider the following calculation of a factorial via Stirling's approximation:
 
-class VectorSelection : public column::definition<> 
-{
+@cpp
+using ull_t = unsigned long long;
+
+// 1. full calculation
+auto factorial(ull_t n) {
+  ull_t result = 1;
+  while (n > 1)
+    result *= n--;
+  return result;
+}
+// 2. approximation
+auto stirling = []() {
+  return std::round(std::sqrt(2 * M_PI * n) * std::pow(n / std::exp(1), n));
+};
 
+// 1. if n is small enough to for n! to fit inside ull_t, use full calculation
+// 2. if n is large enough, use approximation
+auto n = ds.read(dataset::column<ull_t>("n"));
+auto n_f_fast = df.define(column::expression(stirling), n);
+auto n_f_full = df.define(column::expression(factorial), n);
+ull_t n_threshold = 10;
+
+// using expression
+auto n_f_slow = df.define(column::expression([n_threshold](ull_t n, ull_t fast, ull_t slow){
+  return n >= n_threshold ? n_fast : n_flow; });
+// time elapsed = t(n) + t(fast) + t(slow)
+// :(
+
+// using definition
+class Factorial : public column::definition<double(ull_t, double, double)> {
+public:
+  Factorial(ull_t threshold) : m_threshold(threshold) {}
+  virtual ~Factorial() = default;
+  ull_t evaluate(column::observable<ull_t> n, column::observable<ull_t> fast,
+                 column::observable<ull_t> full) const override {
+    return (n.value() >= m_threshold) ? fast.value() : full.value();
+  }
+  void adjust_threshold(ull_t threshold) {
+    m_threshold = threshold;
+  }
+
+protected:
+  ull_t m_threshold;
 };
-auto v_selected = df.define(column::definition<>(), v);
+auto n_f_best = df.define(column::definition<Factorial>(n_threshold), n, n_f_fast, n_f_full);
+// time elapsed = t(n) + { t(n_fast) if n >= 10, t(n_slow) if n < 10 }
+// :)
+
+// advanced: access per-thread instance
+dataflow::node::invoke([](Factorial* n_f){n_f->adjust_threshold(20);}, n_f_best);
 @endcpp
 
 @see 
 - queryosity::column::definition (API)
   - queryosity::column::definition<Out(Ins...)> (ABC)
+- queryosity::column::observable<T> (API)
 
 @section guide-selection Applying selections
 
 Call queryosity::dataflow::filter() or queryosity::dataflow::weight() to initiate a selection in the cutflow, and apply subsequent selections from existing nodes to compound them. 
 
 @cpp
-// -----------------------------------------------------------------------------
-// initiate a cutflow 
-// -----------------------------------------------------------------------------
+auto [w, cat] = ds.read(dataset::column<double>("weight"), dataset::column<std::string>("category"));
+auto a = df.define(column::constant<std::string>("a"));
+auto b = df.define(column::constant<std::string>("b"));
+auto c = df.define(column::constant<std::string>("c"));
 
-// pass all entries, apply a weight
+// initiate a cutflow 
 auto weighted = df.weight(w);
 
-// -----------------------------------------------------------------------------
-// compounding 
-// -----------------------------------------------------------------------------
 // cuts and weights can be compounded in any order.
-
-// ignore entry if weight is negative
 auto cut = weighted.filter(
   column::expression([](double w){return (w>=0;);}), w
   );
 
-// -----------------------------------------------------------------------------
-// branching out
-// -----------------------------------------------------------------------------
 // applying more than one selection from a node creates a branching point.
-
-auto cat = ds.read<std::string>("cat");
-
-auto a = df.define(column::constant<std::string>("a"));
-auto b = df.define(column::constant<std::string>("b"));
-auto c = df.define(column::constant<std::string>("c"));
-
 auto cut_a = cut.filter(cat == a);
 auto cut_b = cut.filter(cat == b);
 auto cut_c = cut.filter(cat == c);
 
-// -----------------------------------------------------------------------------
-// merging
-// -----------------------------------------------------------------------------
 // selections can be merged based on their decision values.
-
 auto cut_a_and_b = df.filter(cut_a && cut_b);
 auto cut_b_or_c = df.filter(cut_b || cut_c);
 @endcpp
@@ -178,23 +207,22 @@ using h1d = qty::hist::hist<double>;
 using h2d = qty::hist::hist<double,double>;
 using linax = qty::hist::axis::regular;
 
-// instantiate a 1d histogram query filled with x over all entries
-auto q = df.make(query::plan<h1d>(linax(100,0.0,1.0))).fill(x).book(inclusive);
+// plan a 1d/2d histogram query filled with x/(x,y)
+auto q_1d = df.make(query::plan<h1d>(linax(100,0.0,1.0))).fill(x);
+auto q_2d = df.make(query::plan<h2d>(linax(100,0.0,1.0), linax(100,0.0,1.0))).fill(x,y);
 
-// a plan can book multiple queries at a selection
-auto [q_a, q_b] = df.make(query::plan<h1d>(linax(100,0.0,1.0))).fill(x).book(cut_a, cut_b);
+// query at multiple selections
+auto [q_1d_a, q_1d_b] = q_1d.book(cut_a, cut_b);
 
-// a selection can book multiple queries (of different types)
-auto qp_1d = df.make(query::plan<h1d>(linax(100,0.0,1.0))).fill(x);
-auto qp_2d = df.make(query::plan<hist2d>(linax(100,0.0,1.0),linax(100,0.0,1.0))).fill(x,y);
-auto [q_1d, q_2d] = sel.book(qp_1d, qp_2d);
+// multiple (different) queries at a selection
+auto [q_1d_c, q_2d_c] = c.book(q_1d, q_2d);
 @endcpp
 
 Access the result of a query to turn all actions eager.
 
 @cpp
-auto hx_a = q_a.result();  // takes a while -- dataset traversal
-auto hxy_sel = q_2d.result();  // instantaneous -- already completed
+auto hx_a = q_1d_a.result(); // takes a while
+auto hxy_c = q_2d_c.result(); // instantaneous
 @endcpp
 
 @see 
@@ -207,69 +235,93 @@ auto hxy_sel = q_2d.result();  // instantaneous -- already completed
 Call queryosity::dataflow::vary() to create varied columns. 
 There are two ways in which variations can be specified:
 
-1. **Automatic.** Specify a specific type of column to be instantiated along with the nominal+variations. Always ensures the lockstep+transparent propagation of variations.
-2. **Manual.** Provide existing instances of columns to be nominal+variations; any column whose output value type is compatible to that of the nominal can be set as a variation.
+1. **Pre-instantiation.** Provide the nominal argument and a mapping of variation name to alternate arguments.
+2. **Post-instantiation.** Provide existing instances of columns to be the nominal and its variations.
+  - Any column whose data type is compatible with that of the nominal can be set as a variation.
 
 Both approaches can (and should) be used in a dataflow for full control over the creation & propagation of systematic variations.
 
 @cpp
-// automatic -- set and forget
+// pre-instantiation
 
 // dataset columns are varied by different column names
 auto x = ds.vary(
-  dataset::column("x_nom"),
-  {"vary_x","x_var"}
-  );
+  dataset::column<double>("x_nom"),
+  {{"shift_x","x_shifted"}, {"smear_x", "x_smeared"}}
+  );  
+// type = qty::lazy<column::fixed<double>>::varied
+// (column::fixed<dobule> is the concrete type)
 
 // constants are varied by alternate values
-auto yes_or_no = df.vary(
+auto y = df.vary(
   column::constant(true),
-  {"no", false}
+  {{"no", false}}
   );
 
-// expressions are varied by alternate expression+input columns
-auto b = df.vary(
-  column::expression(),
-  systematic::variation("vary_b", )
-  )();
-
-// definitions are varied by alternate constructor arguments+input columns
-auto defn = df.vary(
-  column::definition<>(),
-  systematic::variation("vary_c", )
-  )();
+// expressions are varied by alternate expression and input columns
+auto x_pm_y = df.vary(
+  column::expression([](double x, float y){return x+y;}),
+  {
+    {"minus_y", [](double x, float y){return x-y;}}
+  }
+  )(x, y);
 
-// manual method -- man-handle as you see fit
+// post-instantiation
 
 // note:
 // - different column types (column, constant, definition)
 // - different data types (double, int, float)
 auto z_nom = ds.read(dataset::column<double>("z"));
 auto z_fixed = df.define(column::constant<int>(100.0));
 auto z_half = df.define(column::expression([](float z){return z*0.5;}), z_nom);
-
-// as long as their output values are compatible, any set of columns can be used
 auto z = systematic::vary(
   systematic::nominal(z_nom), 
   systematic::variation("z_fixed", f_fixed), 
   systematic::variation("z_half", z_half)
-  );
+  );  
+// qty::lazy<column::valued<double>>::varied
+// (column::valued<double> is the common denominator base class)
+@endcpp
 
-// systematic variations are propagated through selections...
-auto sel = df.filter(yes_or_no);
 
-// ... and queries
-auto q = df.make(query::plan<>()).fill(x).book(yes_or_no);
-q.has_variation("vary_x"); // true, filled with x_var for all entries
-q.has_variation("no"); // true, filled with x_nom for no entries
+@cpp
+// definitions are varied by alternate constructor arguments+input columns
+// since the constructor arguments of an arbitrary class is not known a-priori,
+// each must be provided on its own, rather than in a mapping.
+auto defn = df.vary(
+  column::definition<>(),
+  systematic::variation("vary_c", )
+  )();
+@endcpp
+
+The list of variations in a (set of) action(s) can be always be checked as they are propagated through the dataflow.
+After the dust settles, the nominal and each varied result of a query can be accessed individually.
+
+@cpp
+// check variations
+x.has_variation("shift_x"); // true
+x.has_variation("smear_x"); // true
+x.has_variation("no"); // false
+
+yes_or_no.has_variation("shift_x"); // false
+yes_or_no.has_variation("smear_x"); // false
+yes_or_no.has_variation("no"); // true
+
+systematic::get_variation_names(x, yes_or_no); // {"shift_x", "smear_x", "no"}
+
+// propagation through selection and query
+auto sel = df.filter(yes_or_no);
+auto q = df.get(column::series(x)).book(sel);
 
-// other variations play no role here.
-q.has_variation("vary_b"); // false
+q.has_variation("shift_x"); // true
+q.has_variation("smear_x"); // true
+q.has_variation("no"); // true
 
-// nominal & varied results can be separately accessed
-auto q_nom_res = q.nominal().result();
-auto q_varx_res = q.["vary_x"].result();
-auto q_none_res = q.["no"].result();
+// access nominal+variation results
+q.nominal().result(); // {x_nom_0, ..., x_nom_N}
+q.["shift_x"].result(); // {x_shifted_0, ..., x_shifted_N}
+q.["smear_x"].result(); // {x_smeared_0, ..., x_smeared_N}
+q.["no"].result(); // {}
 @endcpp
 
 @see