diff --git a/docs/f-21-12/notebooks/geom_violin.ipynb b/docs/f-21-12/notebooks/geom_violin.ipynb index c1f244d0778..470d205d035 100644 --- a/docs/f-21-12/notebooks/geom_violin.ipynb +++ b/docs/f-21-12/notebooks/geom_violin.ipynb @@ -12,7 +12,7 @@ " \n", " \n", @@ -24,7 +24,6 @@ } ], "source": [ - "import numpy as np\n", "import pandas as pd\n", "\n", "from lets_plot import *\n", @@ -33,113 +32,32 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 2, "metadata": {}, + "outputs": [], "source": [ - "## Test datasets" + "DRAW_QUANTILES = [.25, .5, .75]" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
05.13.51.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
34.63.11.50.2setosa
45.03.61.40.2setosa
\n", - "
" - ], - "text/plain": [ - " sepal_length sepal_width petal_length petal_width species\n", - "0 5.1 3.5 1.4 0.2 setosa\n", - "1 4.9 3.0 1.4 0.2 setosa\n", - "2 4.7 3.2 1.3 0.2 setosa\n", - "3 4.6 3.1 1.5 0.2 setosa\n", - "4 5.0 3.6 1.4 0.2 setosa" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "iris_df = pd.read_csv(\"https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/iris.csv\")\n", - "\n", - "iris_df.head()" + "def plot_matrix(plots=[], width=400, height=300, columns=2):\n", + " bunch = GGBunch()\n", + " for i in range(len(plots)):\n", + " row = int(i / columns)\n", + " column = i % columns\n", + " bunch.add_plot(plots[i], column * width, row * height, width, height)\n", + " return bunch.show()" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -163,307 +81,361 @@ " \n", " \n", " \n", - " species\n", - " sepal_length\n", - " weight\n", + " Unnamed: 0\n", + " manufacturer\n", + " model\n", + " displ\n", + " year\n", + " cyl\n", + " trans\n", + " drv\n", + " cty\n", + " hwy\n", + " fl\n", + " class\n", " \n", " \n", " \n", " \n", " 0\n", - " setosa\n", - " 4.300000\n", - " 0.222676\n", + " 1\n", + " audi\n", + " a4\n", + " 1.8\n", + " 1999\n", + " 4\n", + " auto(l5)\n", + " f\n", + " 18\n", + " 29\n", + " p\n", + " compact\n", " \n", " \n", " 1\n", - " setosa\n", - " 4.302935\n", - " 0.228662\n", + " 2\n", + " audi\n", + " a4\n", + " 1.8\n", + " 1999\n", + " 4\n", + " manual(m5)\n", + " f\n", + " 21\n", + " 29\n", + " p\n", + " compact\n", " \n", " \n", " 2\n", - " setosa\n", - " 4.305871\n", - " 0.234639\n", + " 3\n", + " audi\n", + " a4\n", + " 2.0\n", + " 2008\n", + " 4\n", + " manual(m6)\n", + " f\n", + " 20\n", + " 31\n", + " p\n", + " compact\n", " \n", " \n", " 3\n", - " setosa\n", - " 4.308806\n", - " 0.240684\n", + " 4\n", + " audi\n", + " a4\n", + " 2.0\n", + " 2008\n", + " 4\n", + " auto(av)\n", + " 
f\n", + " 21\n", + " 30\n", + " p\n", + " compact\n", " \n", " \n", " 4\n", - " setosa\n", - " 4.311742\n", - " 0.246886\n", + " 5\n", + " audi\n", + " a4\n", + " 2.8\n", + " 1999\n", + " 6\n", + " auto(l5)\n", + " f\n", + " 16\n", + " 26\n", + " p\n", + " compact\n", " \n", " \n", "\n", "" ], "text/plain": [ - " species sepal_length weight\n", - "0 setosa 4.300000 0.222676\n", - "1 setosa 4.302935 0.228662\n", - "2 setosa 4.305871 0.234639\n", - "3 setosa 4.308806 0.240684\n", - "4 setosa 4.311742 0.246886" + " Unnamed: 0 manufacturer model displ year cyl trans drv cty hwy \\\n", + "0 1 audi a4 1.8 1999 4 auto(l5) f 18 29 \n", + "1 2 audi a4 1.8 1999 4 manual(m5) f 21 29 \n", + "2 3 audi a4 2.0 2008 4 manual(m6) f 20 31 \n", + "3 4 audi a4 2.0 2008 4 auto(av) f 21 30 \n", + "4 5 audi a4 2.8 1999 6 auto(l5) f 16 26 \n", + "\n", + " fl class \n", + "0 p compact \n", + "1 p compact \n", + "2 p compact \n", + "3 p compact \n", + "4 p compact " ] }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "def construct_violin_df(df, xname, yname, n=512):\n", - " from functools import reduce\n", - "\n", - " from scipy.stats import gaussian_kde\n", + "mpg_df = pd.read_csv(\"https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/mpg.csv\")\n", "\n", - " def get_weights(values):\n", - " def nrd0_bw(kde):\n", - " iqr = np.quantile(kde.dataset, .75) - np.quantile(kde.dataset, .25)\n", - " std = np.std(kde.dataset)\n", - " size = kde.dataset.size\n", - " if iqr > 0:\n", - " return .9 * min(std, iqr / 1.34) * (size ** -.2)\n", - " if std > 0:\n", - " return .9 * std * (size ** -.2)\n", - "\n", - " yrange = np.linspace(values.min(), values.max(), n)\n", - "\n", - " return {yname: yrange, 'weight': gaussian_kde(values, bw_method=nrd0_bw)(yrange)}\n", - "\n", - " def reducer(agg_df, xval):\n", - " weights = get_weights(df[df[xname] == xval][yname])\n", - " y = weights[yname]\n", - " x = [xval] * y.size\n", 
- " w = weights['weight']\n", - "\n", - " return pd.concat([agg_df, pd.DataFrame({xname: x, yname: y, 'weight': w})], ignore_index=True)\n", - "\n", - " return reduce(reducer, df[xname], pd.DataFrame(columns=[xname, yname, 'weight']))\n", - "\n", - "violin_df = construct_violin_df(iris_df, 'species', 'sepal_length')\n", - "violin_df.head()" + "mpg_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Minimalistic example" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
vc1c2
00.496714Ab
1-0.138264Bb
20.647689Aa
31.523030Aa
4-0.234153Ca
\n", - "
" + "
\n", + " " ], "text/plain": [ - " v c1 c2\n", - "0 0.496714 A b\n", - "1 -0.138264 B b\n", - "2 0.647689 A a\n", - "3 1.523030 A a\n", - "4 -0.234153 C a" + "" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "size = 100\n", - "np.random.seed(42)\n", - "random_df = pd.DataFrame({\n", - " 'v': np.random.normal(size=size),\n", - " 'c1': np.random.choice(['A', 'B', 'C'], size=size),\n", - " 'c2': np.random.choice(['a', 'b'], size=size)\n", - "})\n", - "\n", - "random_df.head()" + "ggplot(mpg_df, aes(y='hwy')) + geom_violin() + ggtitle(\"Simplest example\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Comparison of geoms" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
vc1c2
00.496714Ab
1-0.138264NaNb
2NaNAa
31.523030ANaN
4-0.234153Ca
\n", - "
" - ], - "text/plain": [ - " v c1 c2\n", - "0 0.496714 A b\n", - "1 -0.138264 NaN b\n", - "2 NaN A a\n", - "3 1.523030 A NaN\n", - "4 -0.234153 C a" + "
\n", + " " ] }, - "execution_count": 5, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "def mask(p=.1, seed=42):\n", - " np.random.seed(seed)\n", - " return np.random.choice([True, False], random_df.shape[0], p=[p, 1 - p])\n", - "\n", - "nullable_df = random_df.copy()\n", - "nullable_df.loc[mask(seed=1), 'v'] = np.nan\n", - "nullable_df.loc[mask(seed=2), 'c1'] = np.nan\n", - "nullable_df.loc[mask(seed=6), 'c2'] = np.nan\n", + "p_d = ggplot(mpg_df) + \\\n", + " geom_density(aes(x='hwy', fill='drv'), color='black', alpha=.5) + \\\n", + " facet_grid(x='drv') + \\\n", + " coord_flip() + \\\n", + " ggtitle(\"geom_density()\")\n", + "p_v = ggplot(mpg_df, aes(x=as_discrete('drv', order=1), y='hwy')) + \\\n", + " geom_violin(aes(fill='drv'), alpha=.5) + \\\n", + " ggtitle(\"geom_violin()\")\n", "\n", - "nullable_df.head()" + "plot_matrix([p_d, p_v])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Minimalistic example" + "## Original parameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `draw_quantiles`" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
\n", + "
\n", " " ], "text/plain": [ - "" + "" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ggplot(random_df, aes(y='v')) + geom_violin() + ggtitle(\"Simplest example\")" + "tests = [\n", + " {'draw_quantiles': None}, # default\n", + " {'draw_quantiles': [.05, .5, .95]}, # all correct\n", + " {'draw_quantiles': (1/3, .5, 2/3)}, # strange, but correct\n", + " {'draw_quantiles': [.25]}, # only one\n", + " {'draw_quantiles': []}, # empty\n", + " {'draw_quantiles': [0, .5, 1]}, # include borders\n", + " {'draw_quantiles': [-1, .5, 2], 'skip': True}, # beyond borders\n", + " {'draw_quantiles': ['0.25', '0.5', '0.75'], 'skip': True}, # invalid values\n", + " {'draw_quantiles': [True, False], 'skip': True}, # totally invalid values\n", + " {'draw_quantiles': 0.5, 'skip': True}, # wrong parameter type\n", + " {'draw_quantiles': True, 'skip': True}, # another wrong parameter type\n", + " {'draw_quantiles': '0.25', 'skip': True}, # even worse parameter type\n", + " {'draw_quantiles': object(), 'skip': True}, # totally wrong parameter type\n", + "]\n", + "\n", + "ggplot(mpg_df, aes('drv', 'hwy')) + \\\n", + " geom_violin(draw_quantiles=DRAW_QUANTILES) + \\\n", + " ggtitle(\"draw_quantiles={0}\".format(DRAW_QUANTILES))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Comparison of geoms" + "### `scale`" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
\n", + "
\n", " " ], "text/plain": [ - "" + "" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ggplot(random_df, aes(x='c1', y='v')) + \\\n", - " geom_violin(aes(fill='c2'), tooltips=layer_tooltips().line('^x')\n", - " .line('category|@c2')\n", - " .line('v|@v')\n", - " .line('@|@..density..')\n", - " .line('count|@..count..')\n", - " .line('scaled|@..scaled..')) + \\\n", + "ggplot(mpg_df, aes(x='drv', y='hwy')) + \\\n", + " geom_violin(aes(group='year', fill=as_discrete('year')), \\\n", + " draw_quantiles=DRAW_QUANTILES, \\\n", + " tooltips=layer_tooltips().line('^x')\n", + " .line('year|@year')\n", + " .line('hwy|@hwy')\n", + " .line('violinwidth|@..violinwidth..')\n", + " .line('density|@..density..')\n", + " .line('count|@..count..')\n", + " .line('scaled|@..scaled..')) + \\\n", " ggtitle(\"Grouping and tooltips\")" ] }, @@ -894,52 +954,62 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## `coord_flip()`" + "## Facets" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
\n", + "
\n", " " ], "text/plain": [ - "" + "" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ggplot(iris_df, aes('species', 'sepal_length')) + \\\n", - " geom_violin() + \\\n", - " coord_flip() + \\\n", - " ggtitle(\"Use coord_flip()\")" + "ggplot(mpg_df, aes(x='drv', y='hwy')) + \\\n", + " geom_violin(aes(fill=as_discrete('year')), draw_quantiles=DRAW_QUANTILES) + \\\n", + " facet_grid(y='year')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## \"identity\" statistic" + "## `coord_flip()`" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
\n", + "
\n", " " ], "text/plain": [ - "" + "" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ggplot(violin_df, aes('species', 'sepal_length')) + \\\n", - " geom_violin(aes(weight='weight'), stat='identity') + \\\n", - " ggtitle(\"Use 'identity' statistic\")" + "ggplot(mpg_df, aes('drv', 'hwy')) + \\\n", + " geom_violin(draw_quantiles=DRAW_QUANTILES) + \\\n", + " coord_flip() + \\\n", + " ggtitle(\"Use coord_flip()\")" ] }, { @@ -1040,41 +1113,35 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
\n", + "
\n", " " ], "text/plain": [ - "" + "" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ggplot(random_df, aes(as_discrete('c1', order=-1), 'v')) + \\\n", - " geom_violin(aes(color='c1', fill='c1'), alpha=.5, size=2, \\\n", + "ggplot(mpg_df, aes(as_discrete('drv', order=-1), 'hwy')) + \\\n", + " geom_violin(aes(color='drv', fill='drv'), alpha=.5, size=2, \\\n", + " n=8, draw_quantiles=DRAW_QUANTILES,\n", " sampling=sampling_group_systematic(2)) + \\\n", - " facet_grid(x='c2') + \\\n", - " scale_y_continuous(breaks=list(np.linspace(-3, 3, 9))) + \\\n", + " scale_y_continuous(breaks=list(range(12, 29, 2))) + \\\n", " scale_color_brewer(type='qual', palette='Set1') + \\\n", " scale_fill_brewer(type='qual', palette='Set1') + \\\n", - " ylim(-3, 3) + \\\n", - " coord_fixed(ratio=.5) + \\\n", + " ylim(12, 28) + \\\n", + " coord_fixed(ratio=.2) + \\\n", " theme_grey() + \\\n", " ggtitle(\"Some additional aesthetics, parameters and layers\")" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dataset with NaN's" - ] - }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
\n", + "
\n", " " ], "text/plain": [ - "" + "" ] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ggplot(nullable_df, aes('c1', 'v')) + geom_violin()" + "# Note: quartiles for violin need not to be equal to the quartiles for boxplot!\n", + "# See the last paragraph here: https://stackoverflow.com/a/36036821/11771414\n", + "quartiles = [1/4, 2/4, 3/4]\n", + "ggplot(mpg_df, aes(x='drv', y='hwy')) + \\\n", + " geom_violin(draw_quantiles=quartiles) + \\\n", + " geom_boxplot(width=.1)" ] } ], diff --git a/plot-base-portable/src/commonTest/kotlin/jetbrains/datalore/plot/base/stat/YDensityStatTest.kt b/plot-base-portable/src/commonTest/kotlin/jetbrains/datalore/plot/base/stat/YDensityStatTest.kt new file mode 100644 index 00000000000..0beb80ab2a8 --- /dev/null +++ b/plot-base-portable/src/commonTest/kotlin/jetbrains/datalore/plot/base/stat/YDensityStatTest.kt @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2022. JetBrains s.r.o. + * Use of this source code is governed by the MIT license that can be found in the LICENSE file. + */ + +package jetbrains.datalore.plot.base.stat + +import jetbrains.datalore.base.gcommon.collect.ClosedRange +import jetbrains.datalore.plot.base.DataFrame +import jetbrains.datalore.plot.base.StatContext +import jetbrains.datalore.plot.base.data.TransformVar +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertTrue + +class YDensityStatTest { + private fun statContext(d: DataFrame): StatContext { + return SimpleStatContext(d) + } + + private fun dataFrame(dataMap: Map>): DataFrame { + val builder = DataFrame.Builder() + for (key in dataMap.keys) { + builder.put(key, dataMap.getValue(key)) + } + return builder.build() + } + + private fun filteredDataFrame(df: DataFrame, variable: DataFrame.Variable, filterFun: (Double?) 
-> Boolean): DataFrame { + val indices = df.getNumeric(variable) + .mapIndexed { index, v -> if (filterFun(v)) index else null } + .filterNotNull() + + return df.selectIndices(indices) + } + + private fun yDensityStat(scale: YDensityStat.Scale? = null): YDensityStat { + return YDensityStat( + scale = scale ?: YDensityStat.DEF_SCALE, + bandWidth = null, + bandWidthMethod = DensityStat.DEF_BW, + adjust = DensityStat.DEF_ADJUST, + kernel = DensityStat.DEF_KERNEL, + n = DensityStat.DEF_N, + fullScanMax = DensityStat.DEF_FULL_SCAN_MAX + ) + } + + private fun checkStatVar(statDf: DataFrame, variable: DataFrame.Variable) { + assertTrue(statDf.has(variable), "Has var " + variable.name) + } + + private fun checkStatVarAndValuesDomain(statDf: DataFrame, variable: DataFrame.Variable, expectedValuesDomain: Set) { + checkStatVar(statDf, variable) + assertEquals(statDf.getNumeric(variable).toSet(), expectedValuesDomain, "Unique values of var " + variable.name) + } + + private fun checkStatVarAndValuesRange(statDf: DataFrame, variable: DataFrame.Variable, expectedValuesRange: ClosedRange) { + checkStatVar(statDf, variable) + val actualMinValue = statDf.getNumeric(variable).minByOrNull { it!! }!! + assertEquals(expectedValuesRange.lowerEnd, actualMinValue, "Min value of var " + variable.name) + val actualMaxValue = statDf.getNumeric(variable).maxByOrNull { it!! }!! + assertEquals(expectedValuesRange.upperEnd, actualMaxValue, "Max value of var " + variable.name) + } + + private fun checkStatVarAndMaxValue(statDf: DataFrame, variable: DataFrame.Variable, expectedMaxValue: Double) { + checkStatVar(statDf, variable) + val actualMaxValue = statDf.getNumeric(variable).maxByOrNull { it!! }!! 
+ assertEquals(expectedMaxValue, actualMaxValue, "Max value of var " + variable.name) + } + + private fun checkStatVarAndMaxLimit(statDf: DataFrame, variable: DataFrame.Variable, expectedMaxLimit: Double) { + checkStatVar(statDf, variable) + val actualMaxValue = statDf.getNumeric(variable).maxByOrNull { it!! }!! + assertTrue(expectedMaxLimit - actualMaxValue > 0, "Max value of var " + variable.name + " limited") + } + + @Test + fun emptyDataFrame() { + val df = dataFrame(emptyMap()) + val stat = yDensityStat() + val statDf = stat.normalize(stat.apply(df, statContext(df))) + + checkStatVarAndValuesDomain(statDf, Stats.X, emptySet()) + checkStatVarAndValuesDomain(statDf, Stats.Y, emptySet()) + checkStatVarAndValuesDomain(statDf, Stats.VIOLIN_WIDTH, emptySet()) + } + + @Test + fun oneElementDataFrame() { + val yValue = 3.14 + val df = dataFrame(mapOf( + TransformVar.Y to listOf(yValue) + )) + val stat = yDensityStat() + val statDf = stat.normalize(stat.apply(df, statContext(df))) + + checkStatVarAndValuesDomain(statDf, Stats.X, setOf(0.0)) + checkStatVarAndMaxValue(statDf, Stats.VIOLIN_WIDTH, 1.0) + } + + @Test + fun twoElementsInDataFrame() { + val y = listOf(2.71, 3.14) + val df = dataFrame(mapOf( + TransformVar.Y to y + )) + val stat = yDensityStat() + val statDf = stat.normalize(stat.apply(df, statContext(df))) + + checkStatVarAndValuesDomain(statDf, Stats.X, setOf(0.0)) + checkStatVarAndValuesRange(statDf, Stats.Y, ClosedRange(2.71, 3.14)) + checkStatVarAndMaxValue(statDf, Stats.VIOLIN_WIDTH, 1.0) + } + + @Test + fun withNanValues() { + val x = listOf(null, 4.0, 3.0, 3.0, 1.0, 1.0, 2.0, 2.0) + val y = listOf(3.0, null, 2.0, 3.0, 0.0, 1.0, 1.0, 2.0) + val df = dataFrame(mapOf( + TransformVar.X to x, + TransformVar.Y to y + )) + val stat = yDensityStat() + val statDf = stat.normalize(stat.apply(df, statContext(df))) + + checkStatVarAndValuesDomain(statDf, Stats.X, setOf(1.0, 2.0, 3.0)) + checkStatVarAndValuesRange(statDf, Stats.Y, ClosedRange(0.0, 3.0)) + 
checkStatVarAndMaxValue(statDf, Stats.VIOLIN_WIDTH, 1.0) + } + + @Test + fun changeScales() { + val x = listOf(0.0, 0.0, 0.0, 0.0, 1.0, 1.0) + val y = listOf(0.0, 1.0, 2.0, 3.0, 0.0, 1.0) + val df = dataFrame(mapOf( + TransformVar.X to x, + TransformVar.Y to y + )) + + for (scale in YDensityStat.Scale.values()) { + val stat = yDensityStat(scale = scale) + val statDf = stat.normalize(stat.apply(df, statContext(df))) + val statDf0 = filteredDataFrame(statDf, Stats.X) { it == 0.0 } + val statDf1 = filteredDataFrame(statDf, Stats.X) { it == 1.0 } + + checkStatVarAndValuesDomain(statDf, Stats.X, setOf(0.0, 1.0)) + checkStatVarAndValuesRange(statDf0, Stats.Y, ClosedRange(0.0, 3.0)) + checkStatVarAndValuesRange(statDf1, Stats.Y, ClosedRange(0.0, 1.0)) + when (scale) { + YDensityStat.Scale.AREA -> { + checkStatVarAndMaxLimit(statDf0, Stats.VIOLIN_WIDTH, 0.5) + checkStatVarAndMaxValue(statDf1, Stats.VIOLIN_WIDTH, 1.0) + } + YDensityStat.Scale.COUNT -> { + checkStatVarAndMaxLimit(statDf0, Stats.VIOLIN_WIDTH, 0.5) + checkStatVarAndMaxValue(statDf1, Stats.VIOLIN_WIDTH, 0.5) + } + YDensityStat.Scale.WIDTH -> { + checkStatVarAndMaxValue(statDf0, Stats.VIOLIN_WIDTH, 1.0) + checkStatVarAndMaxValue(statDf1, Stats.VIOLIN_WIDTH, 1.0) + } + } + } + } +} \ No newline at end of file diff --git a/plot-demo-common/src/commonMain/kotlin/jetbrains/datalore/plotDemo/model/plotConfig/Violin.kt b/plot-demo-common/src/commonMain/kotlin/jetbrains/datalore/plotDemo/model/plotConfig/Violin.kt index c160546d4ac..ba868a2d33b 100644 --- a/plot-demo-common/src/commonMain/kotlin/jetbrains/datalore/plotDemo/model/plotConfig/Violin.kt +++ b/plot-demo-common/src/commonMain/kotlin/jetbrains/datalore/plotDemo/model/plotConfig/Violin.kt @@ -14,14 +14,6 @@ class Violin { basic(), withNan(), withGroups(), - -// TODO: Move this to tests -// data132Violin(), -// data132ViolinDiscrete(), -// data132ViolinDefaultN(), -// data132ViolinIdentity(), -// data132Boxplot(), -// data123Violin(), ) } @@ -101,162 +93,4 @@ 
class Violin { return HashMap(parsePlotSpec(spec)) } - - private fun data132Violin(): MutableMap { - val spec = "{" + - " 'kind': 'plot'," + - " 'data' : {'x': [1, 3, 2]," + - " 'y': [2, 0, 1]" + - " }," + - " 'mapping': {" + - " 'x': 'x'," + - " 'y': 'y'" + - " }," + - " 'ggtitle': {" + - " 'text': 'x=[1, 3, 2]'" + - " }," + - " 'layers': [" + - " {" + - " 'geom': 'violin'," + - " 'n': 3" + - " }" + - " ]" + - "}" - - return HashMap(parsePlotSpec(spec)) - - } - - private fun data132ViolinDiscrete(): MutableMap { - val spec = "{" + - " 'kind': 'plot'," + - " 'data' : {'x': [1, 3, 2]," + - " 'y': [2, 0, 1]" + - " }," + - " 'mapping': {" + - " 'x': 'x'," + - " 'y': 'y'" + - " }," + - " 'ggtitle': {" + - " 'text': 'x=[1, 3, 2] and discrete'" + - " }," + - " 'layers': [" + - " {" + - " 'geom': 'violin'," + - " 'n': 3" + - " }" + - " ]," + - " 'scales': [" + - " {" + - " 'aesthetic': 'x'," + - " 'discrete': true" + - " }" + - " ]" + - "}" - - return HashMap(parsePlotSpec(spec)) - - } - - private fun data132ViolinDefaultN(): MutableMap { - val spec = "{" + - " 'kind': 'plot'," + - " 'data' : {'x': [1, 3, 2]," + - " 'y': [2, 0, 1]" + - " }," + - " 'mapping': {" + - " 'x': 'x'," + - " 'y': 'y'" + - " }," + - " 'ggtitle': {" + - " 'text': 'x=[1, 3, 2], default n'" + - " }," + - " 'layers': [" + - " {" + - " 'geom': 'violin'" + - " }" + - " ]" + - "}" - - return HashMap(parsePlotSpec(spec)) - - } - - private fun data132ViolinIdentity(): MutableMap { - val spec = "{" + - " 'kind': 'plot'," + - " 'data' : {'x': [1, 1, 1, 3, 3, 3, 2, 2, 2]," + - " 'y': [4, 3, 2, 5, 4, 3, 3, 2, 1]," + - " 'vw': [0, 1, 0, 0, 1, 0, 0, 1, 0]" + - " }," + - " 'mapping': {" + - " 'x': 'x'," + - " 'y': 'y'" + - " }," + - " 'ggtitle': {" + - " 'text': 'x=[1, 3, 2], stat=identity'" + - " }," + - " 'layers': [" + - " {" + - " 'geom': 'violin'," + - " 'mapping': {" + - " 'violinwidth': 'vw'" + - " }," + - " 'stat': 'identity'" + - " }" + - " ]" + - "}" - - return HashMap(parsePlotSpec(spec)) - - } - - 
private fun data132Boxplot(): MutableMap { - val spec = "{" + - " 'kind': 'plot'," + - " 'data' : {'x': [1, 3, 2]," + - " 'y': [2, 0, 1]" + - " }," + - " 'mapping': {" + - " 'x': 'x'," + - " 'y': 'y'" + - " }," + - " 'ggtitle': {" + - " 'text': 'x=[1, 3, 2], geom=boxplot'" + - " }," + - " 'layers': [" + - " {" + - " 'geom': 'boxplot'" + - " }" + - " ]" + - "}" - - return HashMap(parsePlotSpec(spec)) - - } - - private fun data123Violin(): MutableMap { - val spec = "{" + - " 'kind': 'plot'," + - " 'data' : {'x': [1, 2, 3]," + - " 'y': [2, 1, 0]" + - " }," + - " 'mapping': {" + - " 'x': 'x'," + - " 'y': 'y'" + - " }," + - " 'ggtitle': {" + - " 'text': 'x=[1, 2, 3]'" + - " }," + - " 'layers': [" + - " {" + - " 'geom': 'violin'," + - " 'n': 3" + - " }" + - " ]" + - "}" - - return HashMap(parsePlotSpec(spec)) - - } } \ No newline at end of file diff --git a/python-package/lets_plot/plot/geom.py b/python-package/lets_plot/plot/geom.py index 53f2c25b1b5..8102783719c 100644 --- a/python-package/lets_plot/plot/geom.py +++ b/python-package/lets_plot/plot/geom.py @@ -2788,6 +2788,136 @@ def geom_boxplot(mapping=None, *, data=None, stat=None, position=None, show_lege def geom_violin(mapping=None, *, data=None, stat=None, position=None, show_legend=None, sampling=None, tooltips=None, **other_args): + """ + A violin plot is a mirrored density plot with an additional grouping as for a boxplot. + + Parameters + ---------- + mapping : `FeatureSpec` + Set of aesthetic mappings created by `aes()` function. + Aesthetic mappings describe the way that variables in the data are + mapped to plot "aesthetics". + data : dict or `DataFrame` + The data to be displayed in this layer. If None, the default, the data + is inherited from the plot data as specified in the call to ggplot. + stat : str, default='ydensity' + The statistical transformation to use on the data for this layer, as a string. 
+ position : str or `FeatureSpec` + Position adjustment, either as a string ('identity', 'stack', 'dodge', ...), + or the result of a call to a position adjustment function. + show_legend : bool, default=True + False - do not show legend for this layer. + sampling : `FeatureSpec` + Result of the call to the `sampling_xxx()` function. + Value None (or 'none') will disable sampling for this layer. + tooltips : `layer_tooltips` + Result of the call to the `layer_tooltips()` function. + Specifies appearance, style and content. + draw_quantiles : list of float + Draw horizontal lines at the given quantiles of the density estimate. + scale : {'area', 'count', 'width'}, default='area' + If 'area', all violins have the same area. + If 'count', areas are scaled proportionally to the number of observations. + If 'width', all violins have the same maximum width. + other_args + Other arguments passed on to the layer. + These are often aesthetic settings used to set an aesthetic to a fixed value, + like color='red', fill='blue', size=3 or shape=21. + They may also be parameters to the paired geom/stat. + + Returns + ------- + `LayerSpec` + Geom object specification. + + Notes + ----- + Computed variables: + + - ..violinwidth.. : density scaled for the violin plot, according to area, counts or to a constant maximum width (mapped by default). + - ..density.. : density estimate. + - ..count.. : density * number of points. + - ..scaled.. : density estimate, scaled to a maximum of 1. + + `geom_violin()` understands the following aesthetic mappings: + + - x : x-axis coordinates. + - y : y-axis coordinates. + - alpha : transparency level of a layer. Understands numbers between 0 and 1. + - color (colour) : color of the geometry lines. Can be continuous or discrete. For continuous values this will be a color gradient between two colors. + - fill : color of the geometry filling. + - size : line width. + - linetype : type of the line of border.
Codes and names: 0 = 'blank', 1 = 'solid', 2 = 'dashed', 3 = 'dotted', 4 = 'dotdash', 5 = 'longdash', 6 = 'twodash'. + - weight : used by 'ydensity' stat to compute weighted density. + + Examples + -------- + .. jupyter-execute:: + :linenos: + :emphasize-lines: 9 + + import numpy as np + from lets_plot import * + LetsPlot.setup_html() + n = 100 + np.random.seed(42) + x = np.random.choice(['a', 'b', 'c'], size=n) + y = np.random.normal(size=n) + ggplot({'x': x, 'y': y}, aes(x='x', y='y')) + \\ + geom_violin() + + | + + .. jupyter-execute:: + :linenos: + :emphasize-lines: 9 + + import numpy as np + from lets_plot import * + LetsPlot.setup_html() + n = 100 + np.random.seed(42) + x = np.random.choice(['a', 'b', 'b', 'c'], size=n) + y = np.random.normal(size=n) + ggplot({'x': x, 'y': y}, aes('x', 'y')) + \\ + geom_violin(scale='count', draw_quantiles=[.25, .5, .75]) + + | + + .. jupyter-execute:: + :linenos: + :emphasize-lines: 10 + + import numpy as np + from lets_plot import * + LetsPlot.setup_html() + n = 3 + np.random.seed(42) + x = ['a'] * n + ['b'] * n + ['c'] * n + y = 3 * list(range(n)) + vw = np.random.uniform(size=3*n) + ggplot({'x': x, 'y': y, 'vw': vw}, aes('x', 'y')) + \\ + geom_violin(aes(violinwidth='vw', fill='x'), stat='identity') + + | + .. jupyter-execute:: + :linenos: + :emphasize-lines: 10-11 + + import numpy as np + import pandas as pd + from lets_plot import * + LetsPlot.setup_html() + n, m = 100, 5 + np.random.seed(42) + df = pd.DataFrame({'x%s' % i: np.random.normal(size=n) \\ + for i in range(1, m + 1)}) + ggplot(df.melt(), aes('variable', 'value')) + \\ + geom_violin(aes(color='variable', fill='variable'), \\ + size=2, alpha=.5, scale='width') + \\ + geom_boxplot(aes(fill='variable'), width=.2) + + """ return _geom('violin', mapping=mapping, data=data,