From b6224fd519f95e68d8b93ba90376fd94c8b76e69 Mon Sep 17 00:00:00 2001 From: Niklas Mischkulnig <4586894+mischnic@users.noreply.github.com> Date: Fri, 11 Aug 2023 13:37:42 +0200 Subject: [PATCH] Scopehoisting contributor documentation (#8402) --- docs/Deferring.md | 77 +++++++++ docs/Scopehoisting Packager.md | 63 +++++++ docs/Scopehoisting Transformer.md | 142 ++++++++++++++++ docs/Scopehoisting.md | 270 ++++++++++++++++++++++++++++++ docs/Symbol Propagation.md | 191 +++++++++++++++++++++ docs/swc Visitors.md | 156 +++++++++++++++++ 6 files changed, 899 insertions(+) create mode 100644 docs/Deferring.md create mode 100644 docs/Scopehoisting Packager.md create mode 100644 docs/Scopehoisting Transformer.md create mode 100644 docs/Scopehoisting.md create mode 100644 docs/Symbol Propagation.md create mode 100644 docs/swc Visitors.md diff --git a/docs/Deferring.md b/docs/Deferring.md new file mode 100644 index 00000000000..2a5d37f4292 --- /dev/null +++ b/docs/Deferring.md @@ -0,0 +1,77 @@ +# Deferring Assets + +(The core idea and benefits are described in [Scopehoisting](Scopehoisting.md)). + +Even if the usual way to describe deferring is via dependencies (and this is also how the API exposes it), the entity that's actually getting deferred is the asset group node. This is because the dependency is just the dependency "request" (though not as in "request" graph) and doesn't know yet whether the resolved asset is side-effect free. That is only known after the resolver ran (and the resolver result is stored in the asset group node). + +## Deferring + +This might be the current state of the asset graph during transformation, only the "Button" reexport of the library is used so far, and the other reexport "Switch" wasn't imported anywhere (yet). So the "Switch" asset was deferred. + +The `deferred`/`hasDeferred` properties correspond to the asset graph node properties.
+ +```mermaid +graph TD; + AssetA + -->DependencyLibA[DependencyLibA:Button] + -->AssetGroupLib[AssetGroupLib
hasDeferred] + -->AssetLib[AssetLib
hasDeferred]; + AssetLib + -->DependencyLibButton[DependencyLibButton:Button] + -->AssetGroupLibButton + -->AssetLibButton; + AssetLib + -->DependencyLibSwitch[DependencyLibSwitch:Switch
hasDeferred] + -->AssetGroupLibSwitch[AssetGroupLibSwitch
deferred]; + + classDef asset fill:orange,stroke:orange; + classDef dep fill:lime,stroke:lime; + class AssetA asset; + class AssetLib asset; + class AssetLibButton asset; + class DependencyLibA dep; + class DependencyLibButton dep; + class DependencyLibSwitch dep; +``` + +This is detected in [`assetGraph.shouldVisitChild(DependencyLibSwitch, AssetGroupLibSwitch)`](https://github.com/parcel-bundler/parcel/blob/9e5d05586577e89991ccf90400f2c741dca11aa3/packages/core/core/src/AssetGraph.js#L305) which calls `assetGraph.shouldDeferDependency` (reads the symbol information and determines if the dependency is unused). Then `markParentsWithHasDeferred(DependencyLibSwitch)` is called to add the `hasDeferred=true` flags for the parent asset and asset group nodes. + +Because `shouldVisitChild` returns false, the graph traversal never visits the asset group node and also never transforms the corresponding asset. + +### Undeferring + +Now another dependency is added/discovered during transformation, the asset group should be undeferred and the asset should get transformed: + +```mermaid +graph TD; + AssetA + -->DependencyLibA[DependencyLibA:Button] + -->AssetGroupLib; + AssetB + -->DependencyLibB[DependencyLibB:Switch] + -->AssetGroupLib; + AssetGroupLib[AssetGroupLib
hasDeferred] + -->AssetLib[AssetLib
hasDeferred]; + AssetLib + -->DependencyLibButton[DependencyLibButton:Button] + -->AssetGroupLibButton + -->AssetLibButton; + AssetLib + -->DependencyLibSwitch[DependencyLibSwitch:Switch
hasDeferred] + -->AssetGroupLibSwitch[AssetGroupLibSwitch
deferred]; + AssetGroupLibSwitch + -->AssetLibSwitch + + classDef asset fill:orange,stroke:orange; + classDef dep fill:lime,stroke:lime; + class AssetA,AssetB,AssetLib,AssetLibButton asset; + class DependencyLibA,DependencyLibB,DependencyLibButton,DependencyLibSwitch dep; + style AssetLibSwitch fill:transparent,stroke-dasharray: 5 5,stroke:orange; + linkStyle 10 stroke-dasharray: 5 5,stroke-width: 1.5; +``` + +`DependencyLibB` got added to the graph and now all its children are considered: in the asset graph request traversal's `visitChildren` wrapper, there's [an override to revisit nodes if they have `hasDeferred=true`](https://github.com/parcel-bundler/parcel/blob/9e5d05586577e89991ccf90400f2c741dca11aa3/packages/core/core/src/requests/AssetGraphRequest.js#L169). This causes `AssetLib` and in turn `DependencyLibSwitch` to be revisited. + +`shouldVisitChild` and `shouldDeferDependency` then determine that `AssetLibSwitch` is now used and call `unmarkParentsWithHasDeferred(AssetGroupLibSwitch)` which clears `DependencyLibSwitch.hasDeferred`, clears `AssetLib.hasDeferred` (but only if there is no other sibling dependency that is still deferred), and sets `AssetGroupLib.hasDeferred = AssetLib.hasDeferred`. + +`shouldVisitChild` returns true and `AssetGroupLibSwitch` gets visited for the first time, also transforming the asset and creating the asset node. diff --git a/docs/Scopehoisting Packager.md b/docs/Scopehoisting Packager.md new file mode 100644 index 00000000000..2147ed43289 --- /dev/null +++ b/docs/Scopehoisting Packager.md @@ -0,0 +1,63 @@ +# Scopehoisting Packager - Overview + +(The skipping of single assets is described in [Scopehoisting](Scopehoisting.md)). + +## Starting point `package()`: + +1. `loadAssets()`: Load the assets contents from cache and determine which assets are wrapped. +2. 
`processAsset()`/`visitAsset()` which call `buildAsset()`: These will recursively resolve dependency specifiers and inline dependencies, and append the result to the top level `res` string. +3. Kick off the process by calling `processAsset()` for all assets (and skip some to only process assets once if it was already inlined somewhere else). + +## `buildAsset()`: + +1. If the asset should be skipped: ignore the current asset, call `buildAsset()` for dependency assets and concatenate only them together. +2. Call `buildReplacements()`, generating the `Map`s used during the text replacement: + - The dependency map which is used to resolve `import "...";` declarations inserted by the transformer: `${assetId}:${specifier}${specifiertype} -> Dependency` + - Import replacements: the local part of a dependency symbol (`$id$import$foo`) -> result of `getSymbolResolution` (e.g. `$id$export$bar` or `parcelRequire("id").bar`) +3. Call `buildAssetPrelude()`: + - generates `$parcel$defineInteropFlag($id$exports)` call for this asset if needed. + - synthesizes the exports object if needed (including generation of the `$parcel$export` and `$parcel$exportWildcard` calls only for used re/exports) +4. Perform the replacements with `REPLACEMENT_RE` matching one of + - `import "id";` + - will be replaced with the source code of the asset (call `buildAsset()` recursively ). If the referenced asset is wrapped, don't inline but place it after the current asset (into `depContent`). + - calls `getHoistedParcelRequires` to read the `hoistedRequires` list from `getSymbolResolution` and prepend needed requires. + - `$id$exports` + - `module.exports` inside the asset gets replaced with `$id$exports` in the transformer, but for wrapped assets, this has to be replaced back to `module.exports` + - `$id$import|importAsync|require$foo` + - will be looked up in the replacements and replaced with the resolved identifier +5. 
If necessary, wrap the result up until now with `parcelRequire.register("id", ...)`. + +## `getSymbolResolution()`: + +This is a wrapper around `bundleGraph.getSymbolResolution()`. + +The additional dependency argument is used to determine whether CJS interop has to be applied (if it's an ESM import), or whether it's a non-conditional import (and a hoisted `parcelRequire` call has to be generated). + +Compared to the bundle graph's method, the `parentAsset` is used to make wrapped assets using their own namespace object refer to `module.exports` instead of `$id$exports`. + +- It returns the resolved expression for the specified symbol: + - `$id$export$bar` (e.g. same-bundle ESM import), + - `$id$exports` (e.g. same-bundle ESM import), + - `$id$exports.bar` (e.g. non statically analyzable exports) or + - `parcelRequire("id").bar` (wrapped/in another bundle) + - `$parcel$interopDefault` (if an ESM default import resolved to a non-statically analyzable CJS asset) +- also handles interop (if the default symbol is imported and the resolved asset is CJS, use the namespace instead) +- tracks imports of wrapped assets (which will need `parcelRequire` call) by mutating the `hoistedRequires` list + +## `bundleGraph.getSymbolResolution()` + +This method transitively/recursively traverses the reexports of the asset to find the specified export. This enables resolving some import to the actual value and not just some reexporting binding. + +The result is an `asset`, the `exportSymbol` string, and `symbol`. The value can be accessed from `$asset.id$exports[exportSymbol]`, which is potentially also already (or only) available via the top-level variable `symbol`. So for the add/square example in [Scopehoisting](Scopehoisting.md), `getSymbolResolution(math.js, "add")` would return `{asset: "math.js", exportSymbol: "add", symbol: "$fa6943ce8a6b29$export$add"}`. 
+ +While this improves code size, an imperfection with this system is that it actually means that an asset A can use a value from asset B (which is usually modelled with a dependency from A to B) without there actually being a dependency between the two. Dependencies are also used to determine if an asset is required from another bundle and has to therefore be registered with `parcelRequire`. This discrepancy can be handled inside of a single bundle, but not across multiple bundles, so the `boundary` parameter makes the resolution stop once the bundle is left. + +There are four possible resolution results: + +- the export has been found (with top level variable `symbol`). +- the export has not been found (`symbol === undefined`), this should have been caught already by symbol propagation +- the export has been found and is unused (`symbol === false`) +- it had to bailout because there are multiple possibilities (`symbol === null`), and the caller should fallback to `$resolvedAsset$exports[exportsSymbol]`. Some examples for bailouts are: + + - `export * from "./nonstatic-cjs1.js"; export * from "./nonstatic-cjs2.js";`, so the decision between which reexport to follow should happen at runtime. + - if the `resolvedAsset` is a non-static cjs asset itself, then `module.exports[exportsSymbol]` should be used anyway. diff --git a/docs/Scopehoisting Transformer.md b/docs/Scopehoisting Transformer.md new file mode 100644 index 00000000000..5fb4a162547 --- /dev/null +++ b/docs/Scopehoisting Transformer.md @@ -0,0 +1,142 @@ +# Scopehoisting Transformer + +(Be sure to read [swc Visitors](swc%20Visitors.md) beforehand.) + +("Non-static" refers to a variable being used in a way that cannot be optimized, such as `module.exports[someVariable] = 2`, or `import * as x from "..."; console.log(x[someVariable]);`.) + +The task of the hoist transformer is, in the simplest case, rewriting imports and exports, renaming the uses of the imports. 
The packager can then detect these `import "id:...";` statements to inline dependencies, replace `$id$import$foo` with the resolved expression, and generate necessary `$parcel$export(..., () => $id$export$b)` statements. + + + +
+ +```js +// a.js +import {b} from './b'; +b(); + +// b.js +export let b = 2; +``` + + + +```js +// a.js +import 'id:./b'; +$id$import$b$b(); + +// b.js +let $id$export$b = 2; +``` + +
+ +While this is rather straightforward for pure ESM, a major source of complexity is having to handle arbitrary CJS while still optimizing as much as possible (non-static `module` accesses, non-top-level `require` calls, ...). + +In addition to the code, it sets the symbols and various meta properties on both the asset and the dependencies: + +- `asset.meta.id`: depending on which transformers run after the JS transformer, the value of `asset.id` will be different in packager from the id used for the various variables like `$id$export$foo`. The current asset id in the JS transformer is therefore stored. +- `asset.meta.hasCJSExports`: true if there is at least one CJS export +- `asset.meta.staticExports`: false if there is at least one CJS export that doesn't follow the pattern `module.exports.foo = ...` +- `asset.meta.shouldWrap`: Some constructs require this asset being wrapped in a `parcelRequire.register` block: top-level returns, non-static uses of `module`, eval, reassigning `module` or `exports` +- `dep.meta.shouldWrap`: this is a conditional require +- `dep.meta.promiseSymbol`: see the "Dynamic Imports" section + +## Detecting non-static CJS imports/exports + +A commonly used pattern is detecting some special case patterns such as top-level `var x = require("...");` or `aNamespaceObject.foo` or top-level `module.exports.foo = ...;` as high up in the visitor functions as possible and not traversing the children at all if there's a match. + +So there is a check for static top-level requires in `visit_module`, and if the `visit_expr` visitor is reached for `require("...")`, it is definitely a non-static (and conditional) require. + +The `typeof` visitor doesn't traverse the children if the argument is `module`, so that `typeof module` doesn't count towards the non-static accesses to `module`. 
+ +## Self References + +Because even `module.exports.foo = ...;` statements are detected and turned into symbols just like ESM exports, reading `module.exports` or `module.exports.foo` would naively not cause all of the exports to be preserved nor a namespace object to be generated (because looking at the graph and the symbol data, they are unused). + +So instead, reading `module.exports` is expressed just like it is in ESM: by adding an import to the asset itself with the symbols being used. This is called a "self reference". + +## Identifier Names + +There are names to uniquely identify an import, the actual format doesn't actually matter for the code, as long as it's used consistently (Parcel never re-parses these names to retrieve the parts again): + +- `$x$import$y` = Asset with id `x` imported the namespace of the dependency with hashed source `y` +- `$x$import$y$z` = Asset with id `x` imported the hashed export `z` of the dependency with hashed source `y` +- `$x$require$y` = Asset with id `x` required the namespace of the dependency with hashed source `y` + +and to uniquely identify an export: + +- `$x$exports` = The namespace exports object of the asset with id `x` +- `$x$exports$y` = The hashed export `y` of the asset with id `x` + +(The symbol names are hashed because it's possible to have export names that are invalid Javascript identifiers: `module.exports["a b"] = 1;` or `export {x as "a b"}`, or via CSS modules.) + +## Dynamic Imports + +Dynamic imports such as `import("..").then(({ foo }) => log(foo));` will only cause `foo` to be used and not the entire asset. But at runtime, we still need a namespace object from which to access `foo`. 
For this reason, + +```js +import('./other.js').then(({foo}) => log(foo)); +``` + +the dependency: + +``` +{ + promiseSymbol: '$assetId$importAsync$other' + symbols: { + 'foo' => { + local: '$assetId$importAsync$other$90a7f3efeed30595', + } + } +} +``` + +the generated code: + +```js +import 'assetId:21eb38ddd81971f9'; +$assetId$importAsync$other.then(({foo}) => log(foo)); +``` + +So `import()` is replaced by an identifier that isn't actually listed in the symbols (because otherwise a symbol for `*` would prevent removing unused symbols), and this is the identifier stored in `dep.meta.promiseSymbol` which is then used for replacement in the packager. + +## Preceding analysis pass: `Collect` + +[This analysis](https://github.com/parcel-bundler/parcel/blob/9e2d5d0d60d08d65b5ae6cd765c907a8753bbf39/packages/transformers/js/core/src/hoist.rs#L1291) is used even without scope-hoisting, to generate symbols in development for deferring. + +- collect which variable refers to an import/export +- find evals, non-static accesses of `module`, `exports`, ..., + +## Actual transformation pass: `Hoist` + +Some of the following steps are skipped when the asset was determined to be wrapped during `Collect` (stored in `self.collect.should_wrap`), since `module` and `exports` will be available in that case anyway and no rewriting has to happen for uses of these. + +[fold_module](https://github.com/parcel-bundler/parcel/blob/9e2d5d0d60d08d65b5ae6cd765c907a8753bbf39/packages/transformers/js/core/src/hoist.rs#L138): + +- match ESM import/export decls + - store in `self.hoisted_import` and `self.reexports`, `self.exported_symbols` + - imports are replaced with `import "...";` + - for exports, just a `var $id$export = xyz` is left, the info what is imported/exported is kept in the maps +- match statically analyzable `var x = require("y");`. 
+ - similarly, the whole statement gets removed and replaced with `import "...";`, + +Then, various replacements happen: + +- [fold_ident](https://github.com/parcel-bundler/parcel/blob/9e2d5d0d60d08d65b5ae6cd765c907a8753bbf39/packages/transformers/js/core/src/hoist.rs#L756) looks up in `collect.imports` whether that identifier refers to an import (this renames expressions that refer to the variable as well as the names of the variable declarations themselves) + +- fold_assign_expr + + - replace `module.exports = ...;` with `$id$exports = ...;` + - replace `module.exports.foo = ...;` with `$id$exports$foo = ...;` and generate a corresponding hoisted `var $id$exports$x;` declaration. + +- fold_expr: + - replace `module.exports.foo` with `$id$export` identifier + - replace `importedNs.foo` with `$id$import$foo` identifier + - replace `require("x").foo` with `$id$import$foo` identifier + - replace `require("x")` with `$id$import` identifier + - replace `import("x")` with `$id$import` identifier + - top-level `this` in ESM -> `undefined` + - top-level `this` in CJS -> `module.exports` + - wrap ESM imports with `(0, ...)` for correct `this` diff --git a/docs/Scopehoisting.md b/docs/Scopehoisting.md new file mode 100644 index 00000000000..5385e8045dc --- /dev/null +++ b/docs/Scopehoisting.md @@ -0,0 +1,270 @@ +# Symbols and Scope Hoisting + +## Concepts + +### Tree Shaking and Scope Hoisting + +Tree shaking refers to the general principle of removing dead code. With a naive browserify-style bundling (= what Parcel does in development builds), exports that are never used in the project are still "used" in a syntactical sense (= not dead), but not in a runtime code coverage sense (= unused). + +Some ways to improve this are: + +- Determine which exports are used, and drop the `export` statement during the build. Then the exported value becomes an unused variable and a minifier can remove it. 
This is what symbol propagation and the conditional generation of only used `$parcel$export()` calls achieves. + This is also why `/*#__PURE__*/` comments are important: + +```js +function Button() {...} +$parcel$export(exports, "Button", () => Button); // was: export { Button }; + +// The export was removed during the build, and the function will be dropped by the minifier: +function Select() {...} +// export {Select}; + +// Without the pure comment, minifiers wouldn't be able to remove the right hand side. +// (Note: Babel/swc add this comment automatically when transpiling JSX) +const MyContext = /*#__PURE__*/ React.createContext(); +// export {MyContext}; +``` + +- Determining used exports covers almost all tree shaking needs, but it would still leave the module registry ("prelude"). + + By concatenating assets into a single scope, the function calls for `parcelRequire("id").foo` can be replaced with a regular variable access `$id$export$foo` (ESM imports are live bindings, so accessing an imported value in a function would perform this function call every single time, though it's just an object lookup anyway). And these `parcelRequire.register(() => {...})` wrappers plus `parcelRequire` calls also have some bundle size overhead. + + It can also improve the effectiveness of the minifier, especially regarding function inlining and constant evaluation, but this really depends on the actual code. + + + +
+ +```js +// math.js +export function add(a, b) { + return a + b; +} + +export function square(a) { + return a * a; +} + +// index.js +import {add} from './math'; +console.log(add(2, 3)); +``` + + + +```js +function $fa6943ce8a6b29$export$add(a, b) { + return a + b; +} + +// dead code +function $fa6943ce8a6b29$export$square(a) { + return a * a; +} + +console.log($fa6943ce8a6b29$export$add(2, 3)); +``` + +
+ +### Skipping assets (deferring and skipping during bundling) + +(An asset or a dependency being unused means `getUsedSymbols(asset or dep).size === 0`). + +There are two ways in which assets can be skipped (not included in the output): + +**Subgraph**: if a reexport is unused, then the whole subgraph of that dependency can be ignored. This system is built into core because this should be safe in any case. + +- _Deferring_: This can happen during the graph visit when building the asset graph. There is effectively a one-reexports-level lookahead, so if an asset reexports some symbol `x` and no incoming dependency requests `x`, then the reexport (and the corresponding dependency) is skipped. This doesn't work for `export *`. + + Another benefit of deferring is that deferred assets don't get transformed in the first place. So something like `import {Button} from "design-system";` would only process that single `export {Button} from "./button";` and completely ignore all other exports in `design-system/index.js`. + + Deferring can also happen without scopehoisting (as the non-scopehoisting JS transformer also sets symbols). + +- _Unused dependency_: This is the same principle as deferring, but for an unlimited reexport depth and also for `export *`. Instead of checking the incoming dependencies and matching with non-star reexports, `bundleGraph.getUsedSymbols(dep).size === 0` is used (this information comes from symbol propagation). + + Symbol propagation currently only runs when scope hoisting is enabled. + +**Single Asset**: if a side-effect free asset only has reexports and doesn't export a value itself (and is also not imported from other bundles), then it can be skipped since the reexports will be resolved to their original assets anyway. This is handled in the JS packager only, and not in core. 
+ +```js +import {a} from './lib.js'; +console.log(a); + +// lib.js, asset gets skipped +export * from './exports-a.js'; // dep used, not skipped +export * from './exports-b.js'; // dep skipped with symbol propagation +export {c} from './exports-c.js'; // dep skipped with deferring +``` + +### Symbols + +Both assets and dependencies have attached symbol information. These are maps that describe what an asset (re)exports, and what a dependency imports/reexports. + +Core (so symbol propagation and `getSymbolResolution`) rely on the following convention (plugins can store custom information in the per-symbol meta properties): + +- `asset.symbols` is a map of export names (= what the export was called in the source) to the local names (whatever Parcel renamed the variable to, e.g. `$id$export$foo`). `*` represents the namespace object and is only set for CJS assets (which makes `getSymbolResolution` fall back to a property access). + +- `dependency.symbols` is a map of import names (= which binding was imported) to the local name (= the identifier that the imported binding got replaced by, e.g. `$id$import$bar`). The whole namespace can be imported by using `*` as the import name. A dependency with a `* -> *` mapping corresponds to `export * from`. + +All CommonJS assets have a `* -> $id$exports` symbol, which serves as a fallback when importing a symbol that is not explicitly listed. This is also what prevents symbol propagation from throwing a `some-commonjs.js does not export foo` error, as this can't be done reliably for CommonJS assets (e.g. assets can be added from outside the asset). + +`module.exports = ...;` or some other non-statically analyzable syntax like accessing `module` freely causes the asset to have a `*`. + +These two types of mapping can be used together to model reexports: + +- `export {a as b} from "x";` is turned into a `a -> $id$import$x$a` mapping on the dependency and a `b -> $id$import$x$a` mapping on the asset. 
+- `export * as a from "x";` is turned into a `* -> $id$import$x` mapping on the dependency and a `a -> $id$import$x` mapping on the asset. +- (`export *` just have that `* -> *` on the dependency) + +Examples: + + + + + + +
+ +```js +export const foo = 2; +``` + + + +``` +asset.symbols = { + foo -> $assetId$export$a829fe +} +``` + +
+ +```js +import {foo} from './other.js'; +``` + + + +``` +dependencies["./other.js"].symbols = { + foo -> $assetId$import$8128f$281fa (isWeak: false) +} +``` + +
+ +```js +export {foo as bar} from './other.js'; +``` + + + +``` +asset.symbols = { + bar -> $assetId$import$8128f$281fa +} +dependencies["./other.js"].symbols = { + foo -> $assetId$import$8128f$281fa (isWeak: true) +} +``` + +
+ +```js +export * from './other.js'; +``` + + + +``` +asset.symbols = {} +dependencies["./other.js"].symbols = { + * -> * (isWeak: true) +} +``` + +
+ +#### Used Symbols + +The used symbols are determined by symbol propagation, and have slightly different meanings for dependencies and assets: + +- `getUsedSymbols(asset)` is the set of symbols that were resolved to this specific asset (so excluding eventual reexports). +- `getUsedSymbols(dependency)` is the set of symbols that are imported through the dependency (so both including reexports). So for an `export {a, b} from "...";` it is a subset of `a,b` and for `export * from "...";` it is the set of symbols that are actually resolved through that reexport. + +### Integrating ESM and CJS with parcelRequire: Circular Imports and Conditional Requires + +#### ESM + +The ES module system behaves exactly like assets getting concatenated and imports getting resolved to their actual bindings. + +```js +// index.js +import {func} from './other.js'; +func(); // ReferenceError: Cannot access 'value' before initialization +export const value = 1; + +// other.js +import {value} from './index.js'; +export function func() { + return value + 1; +} +``` + +If `value` were instead a function, calling it would work correctly (functions are still hoisted). So circular imports are why `$parcel$export` calls also have to be hoisted to the top of the asset. + +#### Limitations + +The reasons why the `parcelRequire` registry is needed are assets being accessed from other bundles (and potentially being duplicated), and conditional requires (which are impossible with pure ESM declarations). + +So assets that have at least one conditional incoming dependency or are used by some other bundle, are wrapped in a `parcelRequire.register`. `require("foo")` calls inside ifs or functions are replaced with the appropriate `parcelRequire("id")` call. 
+ +But since the whole subgraph is conditionally executed, all assets have to be wrapped and inside of that subgraph, imports cannot be replaced with the top level variables anymore, but instead get replaced with the CommonJS equivalent (so `var $id = parcelRequire("id");` and then `$id.foo`) which also runs the side effects. + +### Runtime Deduplication + +One part of scope hoisting is getting rid of the registry that is used in development/browserify, but the registry is unfortunately still needed whenever an asset is included in multiple bundles. This ensures that an asset is only ever evaluated at most once, so that side-effects don't run twice, and that the identity of the exports is retained: + +```js +// index.js +const a = await import("./async1.js"); +const b = await import("./async2.js"); +console.log(a.constructor === b.constructor) // or using `instanceof`, ... + +// async1.js (becomes an async bundle together with a copy of "lib") +import {SomeClass} from 'lib'; +export default new SomeClass(); + +// async2.js (becomes an async bundle together with a copy of "lib") +import {SomeClass} from 'lib'; +export default new SomeClass(); +``` + +### Interop + +The usual way for importing CommonJS using synchronous ESM imports is via a default export, which then contains the exports namespace object of the CommonJS asset. + +```js +import v from './other'; +// v == { x: 2, y: 3 } + +// other.js +module.exports.x = 2; +module.exports.y = 3; +``` + +But by convention (with Babel, tsc, ...), this should not happen if the imported asset is actually an ESM file that was transpiled to CommonJS beforehand (and e.g. published to npm). In that case, the default import should refer to the original default export (which was transpiled to `exports.default = ...;`). Without interop, the default export would be `{ default: ... }`. + +So instead, transpilers add an additional "export" with `exports.__esModule = true;` which declares a file to be ESM-transpiled-to-CommonJS. 
+ +An asset with a default import of a (maybe)-CommonJS file now needs to do a lookup: + +```js +function interopRequireDefault(obj) { + return obj && obj.__esModule ? obj : {default: obj}; +} +var _x = interopRequireDefault(require('./x')); +``` + +With scope hoisting, Parcel can omit this call in many cases when the importee was determined to be ESM or ESM-transpiled-to-CommonJS via static analysis. diff --git a/docs/Symbol Propagation.md b/docs/Symbol Propagation.md new file mode 100644 index 00000000000..dc7d29e0761 --- /dev/null +++ b/docs/Symbol Propagation.md @@ -0,0 +1,191 @@ +# Symbol Propagation + +The goal of symbol propagation is generating the sets of (transitively) used symbols based on the symbols set on assets and dependencies (see [Scopehoisting](Scopehoisting.md)). + +## Two Passes + +In the most basic case, the used symbols can be determined by traversing through all assets, and repeatedly forwarding the symbols of incoming dependencies down through the reexports corresponding to outgoing dependencies (and always matching them to the correct reexport, potentially also renaming the symbol). + +But with `export *`, there is no unique reexport to match an incoming symbol request to: + +```js +// index.js +import {a} from './other.js'; + +// other.js +export * from './x.js'; // Is `a` reexported here... +export * from './y.js'; // ... or here? Or neither? + +// x.js +export const a = 1; +// y.js +export const b = 2; +``` + +Instead, there are two passes: + +- in the first ("down") pass, the incoming used symbols are matched to the correct reexport (if there is one), or to _all_ `export *`. So after this pass, the symbol will be marked as used in all potentially relevant dependencies (one of which will be the correct one). + +- in the second ("up") pass, the set of requested symbols (from the down pass) is intersected with the set of actual exports and copied back from the outgoing dependencies to the incoming dependencies. 
There are multiple cases that can occur: + - There is exactly one dependency that can provide the export. + - There is no dependency that can provide the export, which leads to a ["x does not export y" error](https://github.com/parcel-bundler/parcel/blob/f65889ebd768e9b2e146537b47d4d5d82ff177b8/packages/core/core/src/requests/AssetGraphRequest.js#L754-L776). + - (For `export *`:) There are multiple dependencies that can provide the export. This can happen with valid ESM (and the first value will be used), or with non-statically analyzable CJS modules where we have to determine at runtime which value to use. [There's a verbose warning in this case](https://github.com/parcel-bundler/parcel/blob/f65889ebd768e9b2e146537b47d4d5d82ff177b8/packages/core/core/src/requests/AssetGraphRequest.js#L560-L569). + + + + + + + +
Data Flow in Down Traversal | Data Flow in Up Traversal
+ +```mermaid +graph TD; + DepIn1[Incoming dep] --> Asset; + DepIn2[Incoming dep] --> Asset; + DepIn3[Incoming dep] --> Asset; + Asset; + Asset --> DepOut1[Outgoing dep]; + Asset --> DepOut2[Outgoing dep]; + Asset --> DepOut3[Outgoing dep]; + + classDef asset fill:orange,stroke:orange; + classDef dep fill:lime,stroke:lime; + class Asset asset; + class DepIn1,DepIn2,DepIn3,DepOut1,DepOut2,DepOut3 dep; +``` + + + +```mermaid +graph BT; + Asset --> DepIn1[Incoming dep]; + Asset --> DepIn2[Incoming dep]; + Asset --> DepIn3[Incoming dep]; + Asset; + DepOut1[Outgoing dep] --> Asset; + DepOut2[Outgoing dep] --> Asset; + DepOut3[Outgoing dep] --> Asset; + + classDef asset fill:orange,stroke:orange; + classDef dep fill:lime,stroke:lime; + class Asset asset; + class DepIn1,DepIn2,DepIn3,DepOut1,DepOut2,DepOut3 dep; +``` + +
After Down Traversal | After Up Traversal
+ +```js +// index.js +import {a} from './other.js'; // used down: a + +// other.js, used down: +export * from './x.js'; // used down: a +export * from './y.js'; // used down: a (!) + +// x.js, used down: a +export const a = 1; +// y.js, used down: +export const b = 2; +``` + + + +```js +// index.js +import {a} from './other.js'; // used down: a, used up: a + +// other.js, used down: +export * from './x.js'; // used down: a, used up: a +export * from './y.js'; // used down: a, used up: + +// x.js, used: a +export const a = 1; +// y.js, used: +export const b = 2; +``` + +
+ +This is why `DependencyNode#usedSymbolsUp` holds the actual used symbols, and `DependencyNode#usedSymbolsDown` is just an implementation detail. + +## Circular Imports/Reexports + +In both cases, circular reexports also have to be considered: + +```js +// index.js +import {b} from './other.js'; +export const a = 1; +console.log(b); + +// other.js +export {a as b} from './index.js'; +``` + +This traversal logic is abstracted away into the `propagateSymbolsDown` and `propagateSymbolsUp` methods in [AssetGraphRequest.js](../packages/core/core/src/requests/AssetGraphRequest.js), while the visitor function that handles the actual symbol data is passed as a visitor callback. + +The down pass (`propagateSymbolsDown(...)`) performs a queue-based BFS which will continue re-traversing parts of the graph if they are marked dirty (the flags are described in the next sections). The traversal starts with the changed assets and asset groups that had incoming edges removed (which can lead to fewer used symbols). + +The up pass (`propagateSymbolsUp(...)`) is a post-order recursive DFS if more than half of all assets changed, or a queue-based "reverse" BFS (which covers both incremental updates and circular imports by traversing until nothing is marked dirty anymore). + +Another result of having to handle circular imports is error handling. Errors cannot be thrown immediately when a missing reexport is encountered, as this situation also occurs when walking dependency cycles (and continuing just a bit longer might cause an outgoing dependency to get updated with the export that was previously missing): + +The traversal starts at the leaves (other.js): the incoming `index -> other (down: b)` dependency requests `b`, but the outgoing `other -> index` dependency which provides `b` doesn't have `a` in the up set yet. So an error is stored for the other.js asset. 
+ +Next, index.js is visited, and the incoming dependency `other -> index (down: a)` can indeed be satisfied with the export and is therefore added into `usedSymbolsUp`. + +Because `other -> index` was changed, the parent asset is revisited (other.js), and the incoming `index -> other (down: b)` request for `b` is again matched to `a` in the reexport, and the outgoing dependency does indeed have `a` in `outgoing.usedSymbolsUp` so `b` is added to `incoming.usedSymbolsUp`. Since no error was generated when processing other.js, the error from the previous iteration gets discarded. + +| up iteration | `import {b}` | `export {a as b}` | +| --------------------- | ---------------------- | -------------------------------------------------------------------------- | +| start | `usedSymbolsDown`: `b` | `usedSymbolsDown`: `a` | +| 1, asset=other.js | | Error: `a` not found in `usedSymbolsUp` of outgoing dep (the `import {b}`) | +| 2, asset=index.js | | `usedSymbolsUp`: `a` | +| **3, asset=other.js** | `usedSymbolsUp`: `b` | `usedSymbolsUp`: `a` | +| 4, asset=index.js | `usedSymbolsUp`: `b` | `usedSymbolsUp`: `a` | + +Furthermore, it's possible that there are multiple errors for missing exports. But it's hard to determine which of these have the same cause (because in such a circular reexporting chain, all nodes would show an error, there is no "first" one), so it simply throws the first error in the list. This has the downside of only showing one error at a time even if the code contains multiple (unrelated) problematic imports in different assets. + +## Down Traversal + +1. Categorize incoming usedSymbols into `asset.usedSymbols` or the `namespaceReexportedSymbols` based on whether they're listed in `asset.symbols`. So `asset.usedSymbols` now also contains reexports, which will be removed again in the next step. +2. If the asset has no side effects and also nothing is requested by the incoming dependencies, then the entire subgraph is unused and no symbol will be used. 
+ + Otherwise the `namespaceReexportedSymbols` are redistributed to the `export *` dependencies' `usedSymbolsDown`, and the contents of `asset.usedSymbols` are forwarded to individual dependencies' `usedSymbolsDown` where possible (= a reexport = where there's a symbol in the dependency matching the one in the asset symbols). + +If some outgoing dependency was changed by these steps, it's marked as dirty: + +- `dep.usedSymbolsDownDirty = true` so that the dependency resolution will be revisited later in this traversal. +- `dep.usedSymbolsUpDirtyDown = true` so that the up traversal knows that this dependency now requests some other set of symbols, which could potentially require updates or throw an error. + +## Up Traversal + +1. Go through outgoing dependencies and collect all symbols that are reexported from them in `usedSymbolsUp`. +2. Go through incoming dependencies: + + 1. Each symbol that is requested by that dependency (`usedSymbolsDown`) has to be exported by the asset or reexported by some outgoing dependency. + + If a requested symbol cannot be found as a (re)export, then an error "x.js does not export y" is generated. (Unless the incoming dependency is an `export *`: in that case, throwing here could cause an error for a symbol that was only speculatively added as described above.) + + 2. If an incoming dependency is completely unused and also side-effect free, then it's marked as excluded (and now behaves like a deferred dependency). + +If some incoming dependency was changed by these steps, it's marked as dirty: + +- `dep.usedSymbolsUpDirtyUp = true` so that the asset which has `dep` as an outgoing dependency will be revisited. 
+ +## [Storing dependency resolution in the used symbols](https://github.com/parcel-bundler/parcel/pull/8432) + +For better codesplitting and to hide the fact that symbol propagation runs on the whole project and not per bundle, dependencies are split to have one symbol per dependency and then retargeted to the asset that this symbol is originally exported from. + +To compute this information of where a symbol resolves to, the up traversal doesn't work with sets of symbols anymore but with a [map that lists the symbols together with the asset that symbol resolves to](https://github.com/parcel-bundler/parcel/blob/f65889ebd768e9b2e146537b47d4d5d82ff177b8/packages/core/core/src/types.js#L324-L333). The rules are: + +- if a symbol is exported directly (not reexported), then this is stored as the map entry value. +- if a symbol is reexported and the reexporting asset is side-effect free and the symbol unambiguously resolves to a single reexport, then the map entry value is just copied over from the outgoing dependency (and still points to the original asset). +- otherwise if a symbol is reexported, then the map entry value is instead the reexporting asset (which will lead to a correct lookup at runtime, as described [above](#two-passes)). + +The rewriting of dependencies then happens in [`BundleGraph.fromAssetGraph`](https://github.com/parcel-bundler/parcel/blob/6211c79f30e8fe2a1e079339277f11dba4acab2c/packages/core/core/src/BundleGraph.js#L210-L226). + +## (Optional side note: Data Flow Analysis) + +In the big picture, there are some parallels here with [data-flow analysis](https://en.wikipedia.org/wiki/Data-flow_analysis) (which powers unused variable detection or hoisting of common sub-expressions in compilers): a graph is traversed (and parts are revisited) and some local operation is performed for the current node until the node values stabilize (a fixpoint is reached). 
diff --git a/docs/swc Visitors.md b/docs/swc Visitors.md new file mode 100644 index 00000000000..208b18a00b0 --- /dev/null +++ b/docs/swc Visitors.md @@ -0,0 +1,156 @@ +# swc Visitors + +> If you're reading this and want to use swc for something other than the existing Parcel transformer, then you might find this template useful which contains all of the boilerplate to parse some input, give you the AST to work with, and finally also stringify it again: https://github.com/mischnic/swc-example + +An swc visitor is a Rust struct that implements the Visit/Fold/VisitMut trait. Then you can take some AST node (e.g. the top-level `module`) and call `visit_with`: + +```rust +struct Foo { + some_state: Vec +} + +impl Visit for Foo { + // Default implementation for all other nodes: + // fn visit_module(&mut self, node: &Ident) { + // node.visit_children_with(self); + // } + fn visit_expr(&mut self, node: &Expr) { + println!("Some expression!"); + node.visit_children_with(self); + } + fn visit_ident(&mut self, node: &Ident) { + self.some_state.push(node.sym); + } +} + +func main(){ + // ... + let myVisitor = Foo { some_state: vec![] }; + module.visit_with(&mut myVisitor); + // ... +} +``` + +If a function for some node type isn't declared, the default implementation uses `visit_children_with` to then visit all respective child nodes (and eventually also the functions declared in the impl). + +Similarly, overriding such a function but not calling `visit_*` on some child nodes explicitly will then not visit the subtree at all (this really is a straight-forward recursive traversal). + +These are all of the types of visitors (at least the ones used by Parcel): + +- `Visit` (and then `visit_with`/`visit_children_with`): + + The function signatures are `fn visit_expr(&mut self, node: &Expr)`, so you get an immutable reference. This is useful for doing some analysis and no changes. 
+ +- `Fold` (and then `fold_with`/`fold_children_with`): + + The function signatures are `fn fold_expr(&mut self, node: Expr) -> Expr`, so you get the value and not just a reference to the node and also have to return the same node type again. + +- `VisitMut` (and then `visit_mut_with`/`visit_mut_children_with`): + + The function signatures are `fn visit_mut_expr(&mut self, node: &mut Expr)`, so you get a mutable reference to the node and modify it in place instead of returning a new node. + + At least in theory, this is faster than `Fold` because `Fold` has to copy the node values around even if nothing changes. + +## Removing a node or replacing with a different node type + +Let's try to replace `export function Foo(){}` with `function Foo(){}` in swc: first of all it's not possible to straight-up return a `VarDecl` in a `fn fold_export_decl(self, node: ExportDecl)`. + +Instead, this logic needs to be pulled up one level to the `ModuleItem`s: + +```rust +fn fold_module_item(&mut self, node: ModuleItem) -> ModuleItem { + match node { + ModuleItem::ModuleDecl(ModuleDecl::ExportDecl(ExportDecl { + decl: func @ Decl::Fn(_), + .. + })) => { + return ModuleItem::Stmt(Stmt::Decl(func)); + }, + _ => { + return node; + }, + } +} +``` + +It's also not possible to return multiple nodes, so to add or remove nodes (be it statements or variable declarators), also visit the parent and access the array of children (`body` in the case of the module). + +```rust +fn fold_module(&mut self, node: Module) -> Module { + let mut res = node.fold_children_with(self); + if let Some(foo) = self.something { + res.body.insert(0, ast::ModuleItem::Stmt(foo)); + } + res +} +``` + +## Identifiers/scopes + +The type used to represent identifiers is `JsWord` (as opposed to a regular `String` or `&str`). This is a special interned string; to construct it for an arbitrary string, you can use `.into()`. 
Strings that are part of the hard-coded [list of interned words](https://github.com/swc-project/swc/blob/a8748a9191a249fd2a97207cbcf0c3317b1bc1e3/crates/swc_atoms/words.txt#L1) can be retrieved more efficiently by using the `js_word!` macro; trying to use that macro with a string that is not part of the list results in a compile-time error. + +```rust + let x: JsWord = "something".into(); + let y: JsWord = js_word!("require") // or "URL", "default", "eval", ... + + let ident: Ident; // the ast node + ident.sym // the JsWord "string" + ident.span.ctxt // the syntax context +``` + +`@babel/traverse` has the concept of scopes to determine if variables refer to the same binding. swc uses `SyntaxContext` (which is internally just a unique number), with the idea that the pair `(JsWord, SyntaxContext)` refers to the unique (and correct) variable binding (even if there are some `Ident` nodes that have the same string). swc's `hygiene()` visitor then "flushes" this information by renaming identifiers to have unique names (which has to happen before codegen because the actual, textual JavaScript format of course doesn't know about syntax contexts). + +That means to store a list of variable bindings somewhere (e.g. if you know that it's a top level binding and want to store that information), instead of just `JsWord`, the pair of `(JsWord, SyntaxContext)` should be used. (swc has a type for this pair: `Id`, and `ident.to_id()` is a useful helper here). + +## `visit_ident` + +Note that the `Ident` visitor function has to be used with caution as these nodes are not just identifiers that refer to a variable binding, but any kind of "name" in the AST (e.g. when destructuring, for member accesses, private class variables). + +```rust +fn fold_ident(&mut self, node: &Ident) -> Ident { + Ident::new("foo".into(), DUMMY_SP) +} +// and the other visit, visit_mut variants... 
+``` + +will result in + +```js +function foo(foo) { + foo.foo(foo); +} +const foo = {foo: foo}; +class foo { + #foo; + foo() { + foo(this.#foo); + } +} +``` + +## Logic based on existence of an ancestor + +In some cases, it's only necessary to know whether there exists some parent node matching a condition, but it doesn't have to be actually read or modified (e.g. when replacing `this` but only when not inside a function). + +A pattern for this use case is a variable on the struct that is modified when this condition changes. + +```rust +struct Foo { + in_function_scope: bool, +} + +impl Visit for Foo { + fn visit_function(&mut self, node: &Function) { + let old = self.in_function_scope; + self.in_function_scope = true; + node.visit_children_with(self); + self.in_function_scope = old; + } + + fn visit_expr(&mut self, node: &Expr) { + if let Expr::This(_this) = node { + println!(self.in_function_scope); + } + } +} +```