diff --git a/.drone.yml b/.drone.yml index 5ab8e1a8..7f6a42b4 100644 --- a/.drone.yml +++ b/.drone.yml @@ -37,6 +37,7 @@ steps: - apt-get install --assume-yes libxml2 - apt-get install --assume-yes libxml2-dev - apt-get install --assume-yes libglpk-dev + - apt-get install --assume-yes libfontconfig1-dev - echo 'options(repos = c(CRAN = "https://cloud.r-project.org"))' >>"/usr/local/lib/R/etc/Rprofile.site" # package installation - Rscript install.R @@ -44,6 +45,12 @@ steps: - Rscript tests.R depends_on: [clone] +- name: R-4.2 + pull: if-not-exists + image: rocker/r-ver:4.2.1 + commands: *runTests + depends_on: [clone] + - name: R-4.1 pull: if-not-exists image: rocker/r-ver:4.1.3 @@ -96,6 +103,7 @@ steps: - apt-get install --assume-yes libxml2 - apt-get install --assume-yes libxml2-dev - apt-get install --assume-yes libglpk-dev + - apt-get install --assume-yes libfontconfig1-dev - echo 'options(repos = c(CRAN = "https://cloud.r-project.org"))' >>"/usr/local/lib/R/etc/Rprofile.site" # package installation - Rscript install.R @@ -103,6 +111,12 @@ steps: - Rscript showcase.R depends_on: [clone] +- name: R-4.2 + pull: if-not-exists + image: rocker/r-ver:4.2.1 + commands: *runShowcase + depends_on: [clone] + - name: R-4.1 pull: if-not-exists image: rocker/r-ver:4.1.3 diff --git a/NEWS.md b/NEWS.md index 242dba31..7544432a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,19 +2,65 @@ # coronet – Changelog +## 4.2 + +### Added +- Incorporate custom event timestamps, i.e., add a configuration entry to the project configuration that allows specifying a file from which timestamps can be read, as well as an entry that allows locking this data; add corresponding functions `get.custom.event.timestamps`, `set.custom.event.timestamps` and `clear.custom.event.timestamps` (PR #227, 0aa342430ad3b354b9cf954dbe0838b056cf328a, 0f237d03913d2c940a008ea8fe84ba44817e77ea, c1803982357a3272b108f60cb1c976e3c2d9b1e5, 54e089db0ceea07db94914d02655a7f1f67d3117, 54673f8f88ca276ba06396116d802425093544d4, c5f5403430d55ceff6b6d5acbbca1ae9c5c231e2) +- Add function `split.data.time.based.by.timestamps` to allow using custom event timestamps for splitting. 
Alternatively, timestamps can be specified manually (PR #227, 5b8515f97da4a24f971be453589595d259ab1fa1, 43f23a83bc66e485fea371f958bbb2ce3ddbd8d0)
+- Add the following vertex attributes for artifact vertices and corresponding helper functions (PR #229, 20728071ca25e1d20cfa05bc15feb3ecc0a1c434, 51b5478ae15598ed3e6115b22e440929f8084660, 56ed57a21cc8004262ebba88429d0649cb238a52, 9b060361b1d1352b5a431df3990e468df7cab572, 52d40ba3657e3c806516653626afd81018a14863, e91161c79b53be7ba8ce3bec65de01ea6be1c575)
+  - `add.vertex.attribute.artifact.last.edited`
+  - `add.vertex.attribute.mail.thread.contributor.count`, `get.mail.thread.contributor.count`
+  - `add.vertex.attribute.mail.thread.message.count`, `get.mail.thread.message.count`
+  - `add.vertex.attribute.mail.thread.start.date`, `get.mail.thread.start.date`
+  - `add.vertex.attribute.mail.thread.end.date`, `get.mail.thread.end.date`
+  - `add.vertex.attribute.mail.thread.originating.mailing.list`, `get.mail.thread.originating.mailing.list`
+  - `add.vertex.attribute.issue.contributor.count`, `get.issue.contributor.count`
+  - `add.vertex.attribute.issue.event.count`, `get.issue.event.count`
+  - `add.vertex.attribute.issue.comment.event.count`, `get.issue.comment.count`
+  - `add.vertex.attribute.issue.opened.date`, `get.issue.opened.date`
+  - `add.vertex.attribute.issue.closed.date`, `get.issue.closed.date`
+  - `add.vertex.attribute.issue.last.activity.date`, `get.issue.last.activity.date`
+  - `add.vertex.attribute.issue.title`, `get.issue.title`
+  - `add.vertex.attribute.pr.open.merged.or.closed`, `get.pr.open.merged.or.closed`
+  - `add.vertex.attribute.issue.is.pull.request`, `get.issue.is.pull.request`
+
+### Changed/Improved
+- **Breaking Change**: Rename existing vertex attributes for author vertices to be distinguishable from attributes for artifact vertices.
With this change, the first word after `add.vertex.attribute.` now signifies the type of vertex the attribute applies to (PR #229, 75e8514d1d2f6222d2093679f4418e9171d3abf2) + - `add.vertex.attribute.commit.count.author` -> `add.vertex.attribute.author.commit.count` + - `add.vertex.attribute.commit.count.author.not.committer` -> `add.vertex.attribute.author.commit.count.not.committer` + - `add.vertex.attribute.commit.count.committer` -> `add.vertex.attribute.author.commit.count.committer` + - `add.vertex.attribute.commit.count.committer.not.author` -> `add.vertex.attribute.author.commit.count.committer.not.author` + - `add.vertex.attribute.commit.count.committer.and.author` -> `add.vertex.attribute.author.commit.count.committer.and.author` + - `add.vertex.attribute.commit.count.committer.or.author` -> `add.vertex.attribute.author.commit.count.committer.or.author` + - `add.vertex.attribute.artifact.count` -> `add.vertex.attribute.author.artifact.count` + - `add.vertex.attribute.mail.count` -> `add.vertex.attribute.author.mail.count` + - `add.vertex.attribute.mail.thread.count` -> `add.vertex.attribute.author.mail.thread.count` + - `add.vertex.attribute.issue.count` -> `add.vertex.attribute.author.issue.count` + - `add.vertex.attribute.issues.commented.count` -> `add.vertex.attribute.author.issues.commented.count` + - `add.vertex.attribute.issue.creation.count` -> `add.vertex.attribute.author.issue.creation.count` + - `add.vertex.attribute.issue.comment.count` -> `add.vertex.attribute.author.issue.comment.count` + - `add.vertex.attribute.first.activity` -> `add.vertex.attribute.author.first.activity` + - `add.vertex.attribute.active.ranges` -> `add.vertex.attribute.author.active.ranges` +- Add parameter `use.unfiltered.data` to `add.vertex.attribute.issue.*`. This allows selecting whether the filtered or unfiltered issue data is used for calculating the attribute (PR #229, b77601dfa1372af5f58fb552cdb015401a344df7, 922258cb743614e0eeffcf38028acfc0a42a0332) +- Improve handling of issue type in vertex attribute name for `add.vertex.attribute.issue.*`. 
The default attribute name still adjusts to the issue type, but this no longer happens if the same name is specified manually (PR #229, fe5dc61546b81c7779643c3b2b37c101a55217f8)
+
+
## 4.1

### Added
- Incorporate gender data, i.e., add a configuration entry to the project configuration, add function `read.gender` for reading gender data, add functions `get.gender` and `set.gender` and corresponding utility functions to automatically merge gender data to the author data (PR #216, 8868ff47900cf804553ec98683b736b07350fc64, bfbe4deb9d14faeed56bdf37f9f732e01c41af57, 0a23862c6308c27fe4f93835c3a4480eac03ca91, a7744b548ac5ab697a4eb3d71987ddedef180d59, 6a50fd15bdd6382fa3d868a21a41a4b0a36ffce7, 413e24c18532d06144ef996184192594a0893ca3, 39db3158e931fa627e974451ae66c57bd0b77b12, 1e4026def1995a23b3f42eac5eb343ee5a518798)
+- Add testing utility file `tests/testing-utils.R` to enable parameterized tests over the cross-product of multiple parameter values (d876f776439a52a3d34647d16a49ff39379e6da2, 9a1982051cc9849e07f6337ee8141d69def709db, 4dd5896d743c958bbcd6dda16feb50dc03c3a518)

### Changed/Improved
- Add `mode` parameter to `metrics.vertex.degrees` to allow choosing between indegree, outdegree, and total (#219, ae14eb4cb83c6ab8f387886228cdf7ea6f3258c4)
- Adjust `.drone.yml` CI config to prevent pipeline fails: `R` version `3.3` is not tested any more as some packages are not available any more for this `R` version (ca6b474d773c045dd88a19aee905283a373df0a6). Also another docker container in the CI pipeline is used as there are problems with the previously used docker instance (937f797ee04b78a087ea84043d075e7ca7558d70)
+- Add `remove.isolates` parameter to `extract.bipartite.network.from.network`. The default value is `FALSE`, chosen to be consistent with `get.bipartite.network` and other network extraction methods. Previously, isolates were always removed (PR #226, b58394bde421e19eab3470f2266dfff9a7a2dca9, 079a256861a7621118b68bf09ba2dc53efc5f70e)

### Fixed
- Fix values in test for the eigenvector centrality as igraph has changed the calculation of this with version 1.2.7. Also put a warning that we recommend version 1.3.0 in `install.R` and document it in the `README.md` (25fb86277c7cc15b94ca0327bff4bb7e818ca09b, 1bcbca96d6dbaa2d4a28e830da963604682eac70)
- Fix the filtering of the deleted user in `util-read.R` to always be lowercase as the deleted user can appear with different spellings (#214, 1b4072c7ec0e33a595e31d9e9d27bb5c133b1556)
- Add check to `get.first.activity.data` to look for missing activity types.
If no activities are in the RangeData, the function will print a warning and return an empty list (PR #220, #217, 5707517600c5579095c245b63c745d01cde02799, 42a4befb36e7fd9830924dc7fb2e04ecdf86e209, d6424c03baff05562448df1b6b87828ca9a37b88, ca8a1b4c628261dcb471e1da3603439e75e4cc56, f6553c6106e5fec3837c6edb906a4d0960c5c5fb)
+- Fix setting `split.length` properly in the splitting info of the project configuration when the length is determined by splitting with the `number.windows` parameter (PR #222, 2bab846be4ca34fdc45047ec2ddb610c7aeaa555, b467a018b1fbd70ba7848196f520a9202dc319b0)

## 4.0

diff --git a/README.md b/README.md
index c2a6c964..2349cd95 100644
--- a/README.md
+++ b/README.md
@@ -197,6 +197,7 @@ There are two distinguishable types of data sources that are both handled by the
 * Patch-stack analysis to link patches sent to mailing lists and upstream commits
 * Synchronicity information on commits (see also the parameter `synchronicity` in the [`ProjectConf`](#configurable-data-retrieval-related-parameters) class)
   * Synchronous commits are commits that change a source-code artifact that has also been changed by another author within a reasonable time-window.
+ * Custom event timestamps, which have to be specified manually (see also the parameter `custom.event.timestamps.file` in the [`ProjectConf`](#configurable-data-retrieval-related-parameters) class)

The important difference is that the *main data sources* are used internally to construct artifact vertices in relevant types of networks. Additionally, these data sources can be used as a basis for splitting `ProjectData` in a time-based or activity-based manner – obtaining `RangeData` instances as a result (see file `split.R` and the contained functions). Thus, `RangeData` objects contain only data of a specific period of time.

@@ -589,6 +590,13 @@ There is no way to update the entries, except for the revision-based parameters.
  * The time-window (in days) to use for synchronicity data if enabled by `synchronicity = TRUE`
  * [1, *5*, 10, 15]
  * **Note**: If, at least, one artifact in a commit has been edited by more than one developer within the configured time window, then the whole commit is considered to be synchronous.
+- `custom.event.timestamps.file`:
+  * The file to read custom timestamps from.
+  * **Note**: It might make sense to keep several lists of timestamps for different purposes. Therefore, this is the only data source for which the file name can be configured.
+  * **Note**: This parameter does not have a default value.
+- `custom.event.timestamps.locked`:
+  * If set to `TRUE`, the custom event timestamps are locked, i.e., the getter does not read them from the configured file when they are empty or not yet present.
+  * [`TRUE`, *`FALSE`*]

### NetworkConf

diff --git a/showcase.R b/showcase.R
index 031a919b..30dfc91c 100644
--- a/showcase.R
+++ b/showcase.R
@@ -22,6 +22,7 @@
 ## Copyright 2020 by Anselm Fehnker
 ## Copyright 2021 by Johannes Hostert
 ## Copyright 2021 by Niklas Schneider
+## Copyright 2022 by Jonathan Baumann
 ## All Rights Reserved.
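To illustrate the two configuration entries documented above, here is a minimal usage sketch; it only combines calls that appear elsewhere in this diff (in `showcase.R` and `tests/test-data.R`), and `custom-events.list` is the file added for the test projects:

```r
## configure a project to read custom event timestamps from a file
proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT)
proj.conf$update.value("custom.event.timestamps.file", "custom-events.list")

## the getter reads the configured file on demand and returns a named list
## of timestamp strings, e.g. list("Test event 1" = "2016-07-12 15:00:00", ...)
proj.data = ProjectData$new(proj.conf)
timestamps = proj.data$get.custom.event.timestamps()
```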
@@ -71,6 +72,8 @@ ARTIFACT.RELATION = "cochange" # cochange, callgraph, mail, issue ## initialize project configuration proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) proj.conf$update.value("commits.filter.base.artifact", TRUE) +## specify that custom event timestamps should be read from 'custom-events.list' +proj.conf$update.value("custom.event.timestamps.file", "custom-events.list") proj.conf$print() ## initialize network configuration @@ -128,6 +131,7 @@ x.data$get.data.path() x.data$group.artifacts.by.data.column("mails", "author.name") x.data$group.artifacts.by.data.column("commits", "hash") x.data$filter.bots(x.data$get.commits.uncached(remove.untracked.files = TRUE, remove.base.artifact = FALSE, filter.bots = FALSE)) +x.data$get.custom.event.timestamps() ## * Network construction -------------------------------------------------- @@ -217,16 +221,16 @@ my.networks = lapply(cf.data, function(range.data) { return (y$get.author.network()) }) ## add commit-count vertex attributes -sample = add.vertex.attribute.commit.count.author(my.networks, x.data, aggregation.level = "range") -sample.cumulative = add.vertex.attribute.commit.count.author(my.networks, x.data, aggregation.level = "cumulative") +sample = add.vertex.attribute.author.commit.count(my.networks, x.data, aggregation.level = "range") +sample.cumulative = add.vertex.attribute.author.commit.count(my.networks, x.data, aggregation.level = "cumulative") ## add email-address vertex attribute sample.mail = add.vertex.attribute.author.email(my.networks, x.data, "author.email") -sample.mail.thread = add.vertex.attribute.mail.thread.count(my.networks, x.data) -sample.issues.created = add.vertex.attribute.issue.creation.count(my.networks, x.data) -sample.pull.requests = add.vertex.attribute.issue.count(my.networks, x.data, issue.type = "pull.requests") +sample.mail.thread = add.vertex.attribute.author.mail.thread.count(my.networks, x.data) +sample.issues.created = add.vertex.attribute.author.issue.creation.count(my.networks, x.data) +sample.pull.requests = add.vertex.attribute.author.issue.count(my.networks, x.data, issue.type = "pull.requests") ## add vertex attributes for the project-level network x.net.as.list = list("1970-01-01 00:00:00-2030-01-01 00:00:00" = x$get.author.network()) -sample.entire = add.vertex.attribute.commit.count.author(x.net.as.list, x.data, aggregation.level = "complete") +sample.entire = add.vertex.attribute.author.commit.count(x.net.as.list, x.data, aggregation.level = "complete") ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / @@ -293,6 +297,15 @@ for (range in names(cf.data)) { } print(run.lapply(cf.data, "get.class.name")) +## we can also use custom event timestamps for splitting +cf.data = split.data.time.based.by.timestamps(x.data) +for (range in names(cf.data)) { + y.data = cf.data[[range]] + y = NetworkBuilder$new(project.data = y.data, network.conf = net.conf) + plot.network(y$get.bipartite.network()) +} +print(run.lapply(cf.data, "get.class.name")) + cf.data = split.data.activity.based(x.data, activity.amount = 10000, activity.type = "mails") for (range in names(cf.data)) { y.data = cf.data[[range]] @@ -430,7 +443,7 @@ get.author.class.by.type(network = empty.network, type = "network.eigen") get.author.class.by.type(proj.data = empty.range.data, type = "commit.count") get.author.class.by.type(proj.data = empty.range.data, type = "loc.count") -## test function for mutliple ranges (evolution) +## test function for multiple ranges (evolution) 
author.class.overview = get.author.class.overview(network.list = network.list, type = "network.degree")
 get.author.class.overview(network.list = network.list, type = "network.eigen")
 get.author.class.overview(range.data.list = range.list, type = "commit.count")
@@ -449,3 +462,4 @@ calculate.cohens.kappa(author.classification.list = author.class.overview,

 get.class.turnover.overview(author.class.overview = author.class.overview)
 get.unstable.authors.overview(author.class.overview = author.class.overview, saturation = 2)
+
diff --git a/tests.R b/tests.R
index 1e61fdb1..b1472de0 100644
--- a/tests.R
+++ b/tests.R
@@ -13,12 +13,14 @@
 ##
 ## Copyright 2017, 2019 by Claus Hunsen
 ## Copyright 2020-2021 by Thomas Bock
+## Copyright 2022 by Jonathan Baumann
 ## All Rights Reserved.

## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
## Initialization ----------------------------------------------------------

 source("util-init.R")
+source("tests/testing-utils.R")

## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /

diff --git a/tests/README.md b/tests/README.md
index f50c43b3..6eb55791 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -16,9 +16,13 @@ We have two test projects you can use when writing your tests:
     * Commit messages
     * Pasta
     * Synchronicity
+    * Custom event timestamps in `custom-events.list`
+    * Revisions
 2.
    - Casestudy: `test_empty`
    - Selection process: `testing`
    - Contains the following data:
     * Authors
+    * Revisions

-Please note, that there cannot be a project without author data as in this case, `coronet` stops when reading the data. Everything else can be empty.
+Please note that all projects must have author and revision data, as otherwise `coronet` stops when reading the data.
+Everything else can be empty.
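The `custom-events.list` files added below use a simple format: one event per line, with a quoted event name and a quoted timestamp separated by a semicolon. coronet's actual reader lives in `util-read.R` and is not part of this diff; as a standalone sketch of parsing this format (the function name and the `read.table` approach are illustrative assumptions, not coronet's implementation), one could write:

```r
## parse a custom-events.list file into a named list of timestamp strings,
## sorted chronologically (ISO-style timestamps sort correctly as strings)
read.custom.event.timestamps.sketch = function(file) {
    events = read.table(file, sep = ";", quote = "\"", header = FALSE,
                        col.names = c("event", "timestamp"),
                        stringsAsFactors = FALSE)
    timestamps = as.list(events[["timestamp"]])
    names(timestamps) = events[["event"]]
    ## order by timestamp, matching the expected lists in tests/test-data.R
    return(timestamps[order(unlist(timestamps))])
}
```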
diff --git a/tests/codeface-data/configurations/testing/test_feature.conf b/tests/codeface-data/configurations/testing/test_feature.conf index bade450e..971289b5 100644 --- a/tests/codeface-data/configurations/testing/test_feature.conf +++ b/tests/codeface-data/configurations/testing/test_feature.conf @@ -8,6 +8,9 @@ mailinglists: - name: test type: dev source: gmane + - name: test2 + type: dev + source: gmane # date of first release: # 2009-03-05 diff --git a/tests/codeface-data/configurations/testing/test_proximity.conf b/tests/codeface-data/configurations/testing/test_proximity.conf index d6e7c1d6..f9e29177 100644 --- a/tests/codeface-data/configurations/testing/test_proximity.conf +++ b/tests/codeface-data/configurations/testing/test_proximity.conf @@ -8,6 +8,9 @@ mailinglists: - name: test type: dev source: gmane + - name: test2 + type: dev + source: gmane # date of first release: # 2009-03-05 diff --git a/tests/codeface-data/results/testing/test_empty_feature/feature/custom-events.list b/tests/codeface-data/results/testing/test_empty_feature/feature/custom-events.list new file mode 100644 index 00000000..e69de29b diff --git a/tests/codeface-data/results/testing/test_empty_proximity/proximity/custom-events.list b/tests/codeface-data/results/testing/test_empty_proximity/proximity/custom-events.list new file mode 100644 index 00000000..e69de29b diff --git a/tests/codeface-data/results/testing/test_feature/feature/custom-events.list b/tests/codeface-data/results/testing/test_feature/feature/custom-events.list new file mode 100644 index 00000000..5d40db13 --- /dev/null +++ b/tests/codeface-data/results/testing/test_feature/feature/custom-events.list @@ -0,0 +1,5 @@ +"Test event 1";"2016-07-12 15:00:00" +"Test event 5";"2016-10-05 09:00:00" +"Test event 4";"2016-08-08" +"Test event 3";"2016-07-12 16:05:00" +"Test event 2";"2016-07-12 16:00:00" diff --git a/tests/codeface-data/results/testing/test_feature/feature/emails.list b/tests/codeface-data/results/testing/test_feature/feature/emails.list index 6bf6234f..37300429 100644 --- a/tests/codeface-data/results/testing/test_feature/feature/emails.list +++ b/tests/codeface-data/results/testing/test_feature/feature/emails.list @@ -1,17 +1,17 @@ -"Björn";"bjoern@example.org";"";"2004-10-09 18:38:13";200;"Re: Fw: busybox 202 with tab";1 -"Björn";"bjoern@example.org";"<1107974989.17910.6.camel@jmcmullan>";"2005-02-09 18:49:49";-500;"Doubled date";2 -"udo";"udo@example.org";"";"2010-07-12 10:05:36";200;"Only mail address";3 -"Fritz fritz@example.org";"asd@sample.org";"";"2010-07-12 11:05:35";200;"name is mail address";4 -"georg";"heinz@example.org";"";"2010-07-12 12:05:34";200;"name is mail address";5 -"Hans";"hans1@example.org";"";"2010-07-12 12:05:40";200;"name is mail address";6 -"Hans";"hans1@example.org";"";"2010-07-12 12:05:41";200;"name is mail address";6 -"Hans";"hans1@example.org";"";"2010-07-12 12:05:42";200;"name is mail address";6 -"Hans";"hans1@example.org";"";"2010-07-12 12:05:43";200;"name is mail address";6 -"Hans";"hans1@example.org";"";"2010-07-12 12:05:44";200;"name is mail address";6 -"Hans";"hans1@example.org";"";"2010-07-12 12:05:45";200;"name is mail address";6 -"Hans";"hans1@example.org";"";"2010-07-12 12:05:46";200;"name is mail address";7 -"Thomas";"thomas@example.org";"";"";0;"=?KOI8-R?Q?=EF=D4=D7=C5=D4:_Some_patches?= 2";8 -"Björn";"bjoern@example.org";"<4cbaa9ef0802201124v37f1eec8g89a412dfbfc8383a@mail.gmail.com>";"2016-07-12 15:58:40";0;"Re: busybox 1";8 
-"Olaf";"olaf@example.org";"<6784529b0802032245r5164f984l342f0f0dc94aa420@mail.gmail.com>";"2016-07-12 15:58:50";-400;"=?KOI8-R?Q?=EF=D4=D7=C5=D4:_Some_patches?= tab";8 -"Thomas";"thomas@example.org";"<65a1sf31sagd684dfv31@mail.gmail.com>";"2016-07-12 16:04:40";100;"Re: Fw: busybox 2 tab";9 -"Olaf";"olaf@example.org";"<9b06e8d20801220234h659c18a3g95c12ac38248c7e0@mail.gmail.com>";"2016-07-12 16:05:37";200;"Re: Fw: busybox 10";9 +"Björn";"bjoern@example.org";"";"2004-10-09 18:38:13";200;"Re: Fw: busybox 202 with tab";"13#1" +"Björn";"bjoern@example.org";"<1107974989.17910.6.camel@jmcmullan>";"2005-02-09 18:49:49";-500;"Doubled date";"42#2" +"udo";"udo@example.org";"";"2010-07-12 10:05:36";200;"Only mail address";"13#3" +"Fritz fritz@example.org";"asd@sample.org";"";"2010-07-12 11:05:35";200;"name is mail address";"42#4" +"georg";"heinz@example.org";"";"2010-07-12 12:05:34";200;"name is mail address";"42#5" +"Hans";"hans1@example.org";"";"2010-07-12 12:05:40";200;"name is mail address";"42#6" +"Hans";"hans1@example.org";"";"2010-07-12 12:05:41";200;"name is mail address";"42#6" +"Hans";"hans1@example.org";"";"2010-07-12 12:05:42";200;"name is mail address";"42#6" +"Hans";"hans1@example.org";"";"2010-07-12 12:05:43";200;"name is mail address";"42#6" +"Hans";"hans1@example.org";"";"2010-07-12 12:05:44";200;"name is mail address";"42#6" +"Hans";"hans1@example.org";"";"2010-07-12 12:05:45";200;"name is mail address";"42#6" +"Hans";"hans1@example.org";"";"2010-07-12 12:05:46";200;"name is mail address";"42#7" +"Thomas";"thomas@example.org";"";"";0;"=?KOI8-R?Q?=EF=D4=D7=C5=D4:_Some_patches?= 2";"13#8" +"Björn";"bjoern@example.org";"<4cbaa9ef0802201124v37f1eec8g89a412dfbfc8383a@mail.gmail.com>";"2016-07-12 15:58:40";0;"Re: busybox 1";"13#8" +"Olaf";"olaf@example.org";"<6784529b0802032245r5164f984l342f0f0dc94aa420@mail.gmail.com>";"2016-07-12 15:58:50";-400;"=?KOI8-R?Q?=EF=D4=D7=C5=D4:_Some_patches?= tab";"13#8" +"Thomas";"thomas@example.org";"<65a1sf31sagd684dfv31@mail.gmail.com>";"2016-07-12 16:04:40";100;"Re: Fw: busybox 2 tab";"13#9" +"Olaf";"olaf@example.org";"<9b06e8d20801220234h659c18a3g95c12ac38248c7e0@mail.gmail.com>";"2016-07-12 16:05:37";200;"Re: Fw: busybox 10";"13#9" diff --git a/tests/codeface-data/results/testing/test_proximity/proximity/custom-events.list b/tests/codeface-data/results/testing/test_proximity/proximity/custom-events.list new file mode 100644 index 00000000..5d40db13 --- /dev/null +++ b/tests/codeface-data/results/testing/test_proximity/proximity/custom-events.list @@ -0,0 +1,5 @@ +"Test event 1";"2016-07-12 15:00:00" +"Test event 5";"2016-10-05 09:00:00" +"Test event 4";"2016-08-08" +"Test event 3";"2016-07-12 16:05:00" +"Test event 2";"2016-07-12 16:00:00" diff --git a/tests/codeface-data/results/testing/test_proximity/proximity/emails.list b/tests/codeface-data/results/testing/test_proximity/proximity/emails.list index 6bf6234f..37300429 100644 --- a/tests/codeface-data/results/testing/test_proximity/proximity/emails.list +++ b/tests/codeface-data/results/testing/test_proximity/proximity/emails.list @@ -1,17 +1,17 @@ -"Björn";"bjoern@example.org";"";"2004-10-09 18:38:13";200;"Re: Fw: busybox 202 with tab";1 -"Björn";"bjoern@example.org";"<1107974989.17910.6.camel@jmcmullan>";"2005-02-09 18:49:49";-500;"Doubled date";2 -"udo";"udo@example.org";"";"2010-07-12 10:05:36";200;"Only mail address";3 -"Fritz fritz@example.org";"asd@sample.org";"";"2010-07-12 11:05:35";200;"name is mail address";4 -"georg";"heinz@example.org";"";"2010-07-12 12:05:34";200;"name is mail 
address";5 -"Hans";"hans1@example.org";"";"2010-07-12 12:05:40";200;"name is mail address";6 -"Hans";"hans1@example.org";"";"2010-07-12 12:05:41";200;"name is mail address";6 -"Hans";"hans1@example.org";"";"2010-07-12 12:05:42";200;"name is mail address";6 -"Hans";"hans1@example.org";"";"2010-07-12 12:05:43";200;"name is mail address";6 -"Hans";"hans1@example.org";"";"2010-07-12 12:05:44";200;"name is mail address";6 -"Hans";"hans1@example.org";"";"2010-07-12 12:05:45";200;"name is mail address";6 -"Hans";"hans1@example.org";"";"2010-07-12 12:05:46";200;"name is mail address";7 -"Thomas";"thomas@example.org";"";"";0;"=?KOI8-R?Q?=EF=D4=D7=C5=D4:_Some_patches?= 2";8 -"Björn";"bjoern@example.org";"<4cbaa9ef0802201124v37f1eec8g89a412dfbfc8383a@mail.gmail.com>";"2016-07-12 15:58:40";0;"Re: busybox 1";8 -"Olaf";"olaf@example.org";"<6784529b0802032245r5164f984l342f0f0dc94aa420@mail.gmail.com>";"2016-07-12 15:58:50";-400;"=?KOI8-R?Q?=EF=D4=D7=C5=D4:_Some_patches?= tab";8 -"Thomas";"thomas@example.org";"<65a1sf31sagd684dfv31@mail.gmail.com>";"2016-07-12 16:04:40";100;"Re: Fw: busybox 2 tab";9 -"Olaf";"olaf@example.org";"<9b06e8d20801220234h659c18a3g95c12ac38248c7e0@mail.gmail.com>";"2016-07-12 16:05:37";200;"Re: Fw: busybox 10";9 +"Björn";"bjoern@example.org";"";"2004-10-09 18:38:13";200;"Re: Fw: busybox 202 with tab";"13#1" +"Björn";"bjoern@example.org";"<1107974989.17910.6.camel@jmcmullan>";"2005-02-09 18:49:49";-500;"Doubled date";"42#2" +"udo";"udo@example.org";"";"2010-07-12 10:05:36";200;"Only mail address";"13#3" +"Fritz fritz@example.org";"asd@sample.org";"";"2010-07-12 11:05:35";200;"name is mail address";"42#4" +"georg";"heinz@example.org";"";"2010-07-12 12:05:34";200;"name is mail address";"42#5" +"Hans";"hans1@example.org";"";"2010-07-12 12:05:40";200;"name is mail address";"42#6" +"Hans";"hans1@example.org";"";"2010-07-12 12:05:41";200;"name is mail address";"42#6" +"Hans";"hans1@example.org";"";"2010-07-12 12:05:42";200;"name is mail address";"42#6" +"Hans";"hans1@example.org";"";"2010-07-12 12:05:43";200;"name is mail address";"42#6" +"Hans";"hans1@example.org";"";"2010-07-12 12:05:44";200;"name is mail address";"42#6" +"Hans";"hans1@example.org";"";"2010-07-12 12:05:45";200;"name is mail address";"42#6" +"Hans";"hans1@example.org";"";"2010-07-12 12:05:46";200;"name is mail address";"42#7" +"Thomas";"thomas@example.org";"";"";0;"=?KOI8-R?Q?=EF=D4=D7=C5=D4:_Some_patches?= 2";"13#8" +"Björn";"bjoern@example.org";"<4cbaa9ef0802201124v37f1eec8g89a412dfbfc8383a@mail.gmail.com>";"2016-07-12 15:58:40";0;"Re: busybox 1";"13#8" +"Olaf";"olaf@example.org";"<6784529b0802032245r5164f984l342f0f0dc94aa420@mail.gmail.com>";"2016-07-12 15:58:50";-400;"=?KOI8-R?Q?=EF=D4=D7=C5=D4:_Some_patches?= tab";"13#8" +"Thomas";"thomas@example.org";"<65a1sf31sagd684dfv31@mail.gmail.com>";"2016-07-12 16:04:40";100;"Re: Fw: busybox 2 tab";"13#9" +"Olaf";"olaf@example.org";"<9b06e8d20801220234h659c18a3g95c12ac38248c7e0@mail.gmail.com>";"2016-07-12 16:05:37";200;"Re: Fw: busybox 10";"13#9" diff --git a/tests/test-data-cut.R b/tests/test-data-cut.R index fcd8bec3..d1f3ef2a 100644 --- a/tests/test-data-cut.R +++ b/tests/test-data-cut.R @@ -69,7 +69,7 @@ test_that("Cut commit and mail data to same date range.", { date = get.date.from.string(c("2016-07-12 16:04:40", "2016-07-12 16:05:37")), date.offset = as.integer(c(100, 200)), subject = c("Re: Fw: busybox 2 tab", "Re: Fw: busybox 10"), - thread = sprintf("", c(9, 9)), + thread = sprintf("", c("13#9", "13#9")), artifact.type = c("Mail", "Mail")) commit.data = 
x.data$get.data.cut.to.same.date(data.sources = data.sources)$get.commits.unfiltered()
diff --git a/tests/test-data.R b/tests/test-data.R
index 188e37cd..1d06a34c 100644
--- a/tests/test-data.R
+++ b/tests/test-data.R
@@ -18,6 +18,7 @@
 ## Copyright 2020-2021 by Niklas Schneider
 ## Copyright 2021 by Johannes Hostert
 ## Copyright 2021 by Mirabdulla Yusifli
+## Copyright 2022 by Jonathan Baumann
 ## All Rights Reserved.

@@ -154,7 +155,7 @@ test_that("Compare two ProjectData objects on non-empty data", {
     expect_true(proj.data.one$equals(proj.data.two), "Two identical ProjectData objects (issues.filtered).")

     ## test 'equals' on gender
-    proj.data.two$get.gender() 
+    proj.data.two$get.gender()
     expect_false(proj.data.one$equals(proj.data.two), "Two non-identical ProjectData objects (gender).")
     proj.data.one$get.gender()

@@ -420,6 +421,52 @@ test_that("Filter bots from mail data", {
     expect_equal(nrow(filtered.mails), 15)
 })

+test_that("Re-read custom events after config change", {
+    proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT)
+    proj.data = ProjectData$new(proj.conf)
+
+    expect_identical(proj.data$get.custom.event.timestamps(), list(), "Unconfigured timestamps.")
+
+    proj.data$set.project.conf.entry("custom.event.timestamps.file", "revisions.list")
+    timestamps.revisions = proj.data$get.custom.event.timestamps()
+    timestamps.revisions.expected = list(
+        v1 = "2016-07-12 15:58:08",
+        v2 = "2016-07-12 16:00:45",
+        v3 = "2016-07-12 16:06:32"
+    )
+    expect_identical(timestamps.revisions, timestamps.revisions.expected, "Custom timestamps.")
+
+    proj.data$set.project.conf.entry("custom.event.timestamps.file", "custom-events.list")
+    timestamps.events = proj.data$get.custom.event.timestamps()
+    timestamps.events.expected = list(
+        "Test event 1" = "2016-07-12 15:00:00",
+        "Test event 2" = "2016-07-12 16:00:00",
+        "Test event 3" = "2016-07-12 16:05:00",
+        "Test event 4" = "2016-08-08",
+        "Test event 5" = "2016-10-05 09:00:00"
+    )
+
+    expect_identical(timestamps.events, timestamps.events.expected, "Custom timestamps.")
+
+    proj.data$set.project.conf.entry("custom.event.timestamps.file", "nonexistent.file")
+    expect_identical(proj.data$get.custom.event.timestamps(), list(), "Empty timestamps from invalid file.")
+
+    proj.data$set.project.conf.entry("custom.event.timestamps.file", "custom-events.list")
+    timestamps.events = proj.data$get.custom.event.timestamps()
+    timestamps.events.expected = list(
+        "Test event 1" = "2016-07-12 15:00:00",
+        "Test event 2" = "2016-07-12 16:00:00",
+        "Test event 3" = "2016-07-12 16:05:00",
+        "Test event 4" = "2016-08-08",
+        "Test event 5" = "2016-10-05 09:00:00"
+    )
+    expect_identical(timestamps.events, timestamps.events.expected, "Custom timestamps.")
+
+    proj.data$set.project.conf.entry("custom.event.timestamps.locked", TRUE)
+    proj.data$clear.custom.event.timestamps()
+    expect_identical(proj.data$get.custom.event.timestamps(), list(), "Cleared and locked timestamps.")
+})
+
 ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
 ## Data paths of RangeData -------------------------------------------------

diff --git a/tests/test-networks-covariates.R b/tests/test-networks-covariates.R
index 613a5220..5ebaf02e 100644
--- a/tests/test-networks-covariates.R
+++ b/tests/test-networks-covariates.R
@@ -20,6 +20,7 @@
 ## Copyright 2018-2019 by Jakob Kronawitter
 ## Copyright 2021 by Johannes Hostert
 ## Copyright 2021-2022 by Niklas Schneider
+## Copyright 2022 by Jonathan Baumann
 ## All Rights Reserved.
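The locking behaviour exercised at the end of the "Re-read custom events after config change" test above composes with the new splitting function: once read, the timestamps can drive `split.data.time.based.by.timestamps` (as demonstrated in `showcase.R`). A short sketch of that sequence, assuming the same test casestudy configuration used in the test:

```r
proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT)
proj.conf$update.value("custom.event.timestamps.file", "custom-events.list")
proj.data = ProjectData$new(proj.conf)

## split the project data at the configured custom event timestamps
cf.data = split.data.time.based.by.timestamps(proj.data)

## lock the timestamps: after clearing, the getter no longer re-reads
## the file and returns an empty list instead
proj.data$set.project.conf.entry("custom.event.timestamps.locked", TRUE)
proj.data$clear.custom.event.timestamps()
stopifnot(identical(proj.data$get.custom.event.timestamps(), list()))
```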
@@ -59,9 +60,12 @@ myranges.since.2010 = construct.ranges(mybins.since.2010, sliding.window = FALSE #' @return Tuple containing project data and list of networks get.network.covariates.test.networks = function(network.type = c("author", "artifact"), issues = FALSE, author.relation = c("cochange", "issue", "mail"), - bins = mybins) { + artifact.relation = c("cochange", "issue", "mail"), + bins = mybins, + issues.only.comments = FALSE) { author.relation = match.arg(author.relation) + artifact.relation = match.arg(artifact.relation) network.type.function = switch(match.arg(network.type), "author" = "get.author.network", "artifact" = "get.artifact.network") @@ -70,9 +74,10 @@ get.network.covariates.test.networks = function(network.type = c("author", "arti proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) proj.conf$update.value("commits.filter.base.artifact", FALSE) proj.conf$update.value("commits.filter.untracked.files", TRUE) - proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.value("issues.only.comments", issues.only.comments) net.conf = NetworkConf$new() - net.conf$update.values(list(author.relation = author.relation, simplify = FALSE)) + net.conf$update.values(list(author.relation = author.relation, artifact.relation = artifact.relation, + simplify = FALSE)) ## retrieve project data project.data = ProjectData$new(proj.conf) @@ -309,7 +314,7 @@ get.expected.first.activity = function() { return(expected.attributes) } -#' Helper for tests of the function add.vertex.attribute.active.ranges: Returns the expected active ranges per range, +#' Helper for tests of the function add.vertex.attribute.author.active.ranges: Returns the expected active ranges per range, #' author and data source as a nested list. 
#' #' @return A list with elements that represent the range (the test data is split to build one network per range), each @@ -434,8 +439,8 @@ test_that("Test split.and.add.vertex.attribute", { }) }) -#' Test the add.vertex.attribute.commit.count.author method -test_that("Test add.vertex.attribute.commit.count.author", { +#' Test the add.vertex.attribute.author.commit.count method +test_that("Test add.vertex.attribute.author.commit.count", { ## Test setup networks.and.data = get.network.covariates.test.networks() @@ -451,7 +456,7 @@ test_that("Test add.vertex.attribute.commit.count.author", { ## Test lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attr = add.vertex.attribute.commit.count.author( + networks.with.attr = add.vertex.attribute.author.commit.count( networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level ) @@ -461,8 +466,8 @@ test_that("Test add.vertex.attribute.commit.count.author", { }) }) -#' Test the add.vertex.attribute.commit.count.committer.and.author method -test_that("Test add.vertex.attribute.commit.count.committer.and.author", { +#' Test the add.vertex.attribute.author.commit.count.committer.and.author method +test_that("Test add.vertex.attribute.author.commit.count.committer.and.author", { ## Test setup networks.and.data = get.network.covariates.test.networks() @@ -479,7 +484,7 @@ test_that("Test add.vertex.attribute.commit.count.committer.and.author", { ## Test lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attr = add.vertex.attribute.commit.count.committer.and.author( + networks.with.attr = add.vertex.attribute.author.commit.count.committer.and.author( networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level ) @@ -489,8 +494,8 @@ test_that("Test add.vertex.attribute.commit.count.committer.and.author", { }) }) -#' Test the add.vertex.attribute.commit.count.committer.or.author method -test_that("Test add.vertex.attribute.commit.count.committer.or.author", { +#' Test the add.vertex.attribute.author.commit.count.committer.or.author method +test_that("Test add.vertex.attribute.author.commit.count.committer.or.author", { ## Test setup networks.and.data = get.network.covariates.test.networks() @@ -507,7 +512,7 @@ test_that("Test add.vertex.attribute.commit.count.committer.or.author", { ## Test lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attr = add.vertex.attribute.commit.count.committer.or.author( + networks.with.attr = add.vertex.attribute.author.commit.count.committer.or.author( networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level ) @@ -517,8 +522,8 @@ test_that("Test add.vertex.attribute.commit.count.committer.or.author", { }) }) -#' Test the add.vertex.attribute.mail.count method -test_that("Test add.vertex.attribute.mail.count", { +#' Test the add.vertex.attribute.author.mail.count method +test_that("Test add.vertex.attribute.author.mail.count", { ## Test setup networks.and.data = get.network.covariates.test.networks(author.relation = "mail", bins = mybins.since.2010) @@ -534,7 +539,7 @@ test_that("Test add.vertex.attribute.mail.count", { ## Test lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attr = add.vertex.attribute.mail.count( + networks.with.attr = add.vertex.attribute.author.mail.count( networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level ) @@ -544,8 +549,8 @@ test_that("Test add.vertex.attribute.mail.count", { }) }) -#' Test the 
add.vertex.attribute.mail.count method -test_that("Test add.vertex.attribute.mail.thread.count", { +#' Test the add.vertex.attribute.author.mail.thread.count method +test_that("Test add.vertex.attribute.author.mail.thread.count", { ## Test setup networks.and.data = get.network.covariates.test.networks(author.relation = "mail", bins = mybins.since.2010) @@ -561,7 +566,7 @@ test_that("Test add.vertex.attribute.mail.thread.count", { ## Test lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attr = add.vertex.attribute.mail.thread.count( + networks.with.attr = add.vertex.attribute.author.mail.thread.count( networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level ) @@ -586,8 +591,8 @@ sum.expected.attributes = function(expected.attributes.issues.only, expected.att return(result) } -#' Test the add.vertex.attribute.issue.count method -test_that("Test add.vertex.attribute.issue.count", { +#' Test the add.vertex.attribute.author.issue.count method +test_that("Test add.vertex.attribute.author.issue.count", { ## Test setup networks.and.data = get.network.covariates.test.networks(issues=TRUE, author.relation = "issue") @@ -614,7 +619,7 @@ test_that("Test add.vertex.attribute.issue.count", { ## Test issues only lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attr = add.vertex.attribute.issue.count( + networks.with.attr = add.vertex.attribute.author.issue.count( networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, issue.type = "issues" ) @@ -626,7 +631,7 @@ test_that("Test add.vertex.attribute.issue.count", { # Test PRs only lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attr = add.vertex.attribute.issue.count( + networks.with.attr = add.vertex.attribute.author.issue.count( networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, issue.type = "pull.requests", name = "pull.request.count" ) @@ -639,7 +644,7 @@ test_that("Test add.vertex.attribute.issue.count", { # Test both lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attr = add.vertex.attribute.issue.count( + networks.with.attr = add.vertex.attribute.author.issue.count( networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, issue.type = "all" ) @@ -650,8 +655,8 @@ test_that("Test add.vertex.attribute.issue.count", { }) -#' Test the add.vertex.attribute.issues.commented.count method -test_that("Test add.vertex.attribute.issues.commented.count", { +#' Test the add.vertex.attribute.author.issues.commented.count method +test_that("Test add.vertex.attribute.author.issues.commented.count", { ## Test setup networks.and.data = get.network.covariates.test.networks(issues = TRUE, author.relation = "issue") @@ -678,7 +683,7 @@ test_that("Test add.vertex.attribute.issues.commented.count", { ## Test issues only lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attr = add.vertex.attribute.issues.commented.count( + networks.with.attr = add.vertex.attribute.author.issues.commented.count( networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, issue.type = "issues" ) @@ -690,7 +695,7 @@ test_that("Test add.vertex.attribute.issues.commented.count", { # Test PRs only lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attr = add.vertex.attribute.issues.commented.count( + networks.with.attr = add.vertex.attribute.author.issues.commented.count( networks.and.data[["networks"]], 
networks.and.data[["project.data"]], aggregation.level = level, issue.type = "pull.requests", name = "pull.requests.commented.count" ) @@ -703,7 +708,7 @@ test_that("Test add.vertex.attribute.issues.commented.count", { # Test both lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attr = add.vertex.attribute.issues.commented.count( + networks.with.attr = add.vertex.attribute.author.issues.commented.count( networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, issue.type = "all" ) @@ -713,8 +718,8 @@ test_that("Test add.vertex.attribute.issues.commented.count", { }) }) -#' Test the add.vertex.attribute.issue.creation.count method -test_that("Test add.vertex.attribute.issue.creation.count", { +#' Test the add.vertex.attribute.author.issue.creation.count method +test_that("Test add.vertex.attribute.author.issue.creation.count", { ## Test setup networks.and.data = get.network.covariates.test.networks(issues = TRUE, author.relation = "issue") @@ -741,7 +746,7 @@ test_that("Test add.vertex.attribute.issue.creation.count", { ## Test issues only lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attr = add.vertex.attribute.issue.creation.count( + networks.with.attr = add.vertex.attribute.author.issue.creation.count( networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, issue.type = "issues" ) @@ -753,7 +758,7 @@ test_that("Test add.vertex.attribute.issue.creation.count", { # Test PRs only lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attr = add.vertex.attribute.issue.creation.count( + networks.with.attr = add.vertex.attribute.author.issue.creation.count( networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, issue.type = "pull.requests", name = "pull.request.creation.count" ) @@ -766,7 +771,7 @@ test_that("Test add.vertex.attribute.issue.creation.count", { # Test both lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attr = add.vertex.attribute.issue.creation.count( + networks.with.attr = add.vertex.attribute.author.issue.creation.count( networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, issue.type = "all" ) @@ -776,8 +781,8 @@ test_that("Test add.vertex.attribute.issue.creation.count", { }) }) -#' Test the add.vertex.attribute.issue.comment.count method -test_that("Test add.vertex.attribute.issue.comment.count", { +#' Test the add.vertex.attribute.author.issue.comment.count method +test_that("Test add.vertex.attribute.author.issue.comment.count", { ## Test setup networks.and.data = get.network.covariates.test.networks(issues = TRUE, author.relation = "issue") @@ -804,7 +809,7 @@ test_that("Test add.vertex.attribute.issue.comment.count", { ## Test issues only lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attr = add.vertex.attribute.issue.comment.count( + networks.with.attr = add.vertex.attribute.author.issue.comment.count( networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, issue.type = "issues" ) @@ -816,7 +821,7 @@ test_that("Test add.vertex.attribute.issue.comment.count", { # Test PRs only lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attr = add.vertex.attribute.issue.comment.count( + networks.with.attr = add.vertex.attribute.author.issue.comment.count( networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, issue.type = "pull.requests", name = 
"pull.request.comment.count" ) @@ -829,7 +834,7 @@ test_that("Test add.vertex.attribute.issue.comment.count", { # Test both lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attr = add.vertex.attribute.issue.comment.count( + networks.with.attr = add.vertex.attribute.author.issue.comment.count( networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, issue.type = "all" ) @@ -863,8 +868,8 @@ test_that("Test add.vertex.attribute.author.email", { expect_identical(expected.attributes, actual.attributes) }) -#' Test the add.vertex.attribute.artifact.count method -test_that("Test add.vertex.attribute.artifact.count", { +#' Test the add.vertex.attribute.author.artifact.count method +test_that("Test add.vertex.attribute.author.artifact.count", { ## Test setup @@ -882,7 +887,7 @@ test_that("Test add.vertex.attribute.artifact.count", { ## Test lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attr = add.vertex.attribute.artifact.count( + networks.with.attr = add.vertex.attribute.author.artifact.count( networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level ) @@ -892,8 +897,8 @@ test_that("Test add.vertex.attribute.artifact.count", { }) }) -#' Test the add.vertex.attribute.first.activity method with computation over all types. -test_that("Test add.vertex.attribute.first.activity with multiple types and computation over all types", { +#' Test the add.vertex.attribute.author.first.activity method with computation over all types. +test_that("Test add.vertex.attribute.author.first.activity with multiple types and computation over all types", { ## Test setup @@ -968,7 +973,7 @@ test_that("Test add.vertex.attribute.first.activity with multiple types and comp lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attributes = add.vertex.attribute.first.activity( + networks.with.attributes = add.vertex.attribute.author.first.activity( list.of.networks = networks.and.data[["networks"]], project.data = networks.and.data[["project.data"]], activity.types = c("mails", "commits", "issues"), name = "first.activity", aggregation.level = level, default.value = NA, combine.activity.types = TRUE @@ -979,8 +984,8 @@ test_that("Test add.vertex.attribute.first.activity with multiple types and comp }) }) -#' Test the add.vertex.attribute.first.activity method with multiple activity types and computation per type. -test_that("Test add.vertex.attribute.first.activity with multiple types and computation per type", { +#' Test the add.vertex.attribute.author.first.activity method with multiple activity types and computation per type. +test_that("Test add.vertex.attribute.author.first.activity with multiple types and computation per type", { ## Test setup @@ -994,7 +999,7 @@ test_that("Test add.vertex.attribute.first.activity with multiple types and comp lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attributes = add.vertex.attribute.first.activity( + networks.with.attributes = add.vertex.attribute.author.first.activity( list.of.networks = networks.and.data[["networks"]], project.data = networks.and.data[["project.data"]], activity.types = c("mails", "commits", "issues"), name = "first.activity", aggregation.level = level, default.value = NA, combine.activity.types = FALSE @@ -1005,8 +1010,8 @@ test_that("Test add.vertex.attribute.first.activity with multiple types and comp }) }) -#' Test the add.vertex.attribute.first.activity method with one activity type and computation per type. 
-test_that("Test add.vertex.attribute.first.activity with one type and computation per type", { +#' Test the add.vertex.attribute.author.first.activity method with one activity type and computation per type. +test_that("Test add.vertex.attribute.author.first.activity with one type and computation per type", { ## Test setup @@ -1026,7 +1031,7 @@ test_that("Test add.vertex.attribute.first.activity with one type and computatio lapply(AGGREGATION.LEVELS, function(level) { - networks.with.attributes = add.vertex.attribute.first.activity( + networks.with.attributes = add.vertex.attribute.author.first.activity( list.of.networks = networks.and.data[["networks"]], project.data = networks.and.data[["project.data"]], activity.types = c("mails"), name = "first.activity", aggregation.level = level, default.value = NA, combine.activity.types = FALSE @@ -1037,8 +1042,8 @@ test_that("Test add.vertex.attribute.first.activity with one type and computatio }) }) -#' Test the add.vertex.attribute.active.ranges method with computation over all types -test_that("Test add.vertex.attribute.active.ranges with computation over all types", { +#' Test the add.vertex.attribute.author.active.ranges method with computation over all types +test_that("Test add.vertex.attribute.author.active.ranges with computation over all types", { ## Test setup networks.and.data = get.network.covariates.test.networks() @@ -1047,7 +1052,7 @@ test_that("Test add.vertex.attribute.active.ranges with computation over all typ networks.and.data$project.data$set.project.conf.entry("issues.locked", TRUE) ## Test - networks.with.attr = add.vertex.attribute.active.ranges( + networks.with.attr = add.vertex.attribute.author.active.ranges( networks.and.data[["networks"]], networks.and.data[["project.data"]], combine.activity.types = TRUE ) @@ -1064,8 +1069,8 @@ test_that("Test add.vertex.attribute.active.ranges with computation over all typ expect_identical(expected.attributes, actual.attributes) }) -#' Test default values for the add.vertex.attribute.active.ranges method -test_that("Test default values of add.vertex.attribute.active.ranges", { +#' Test default values for the add.vertex.attribute.author.active.ranges method +test_that("Test default values of add.vertex.attribute.author.active.ranges", { ## Test setup networks.and.data = get.network.covariates.test.networks() @@ -1078,7 +1083,7 @@ test_that("Test default values of add.vertex.attribute.active.ranges", { test.data = networks.and.data[["project.data"]] test.activity.types = c("mails", "issues") test.default.value = "test.default.value" - networks.with.attr = add.vertex.attribute.active.ranges(test.networks, test.data, + networks.with.attr = add.vertex.attribute.author.active.ranges(test.networks, test.data, activity.types = test.activity.types, default.value = test.default.value) actual.attributes = lapply(networks.with.attr, igraph:: get.vertex.attribute, name = "active.ranges") @@ -1311,6 +1316,64 @@ test_that("Test add.vertex.attribute.artifact.first.occurrence", { }) }) +#' Test the add.vertex.attribute.artifact.last.edited method +test_that("Test add.vertex.attribute.artifact.last.edited", { + + ## Test setup + + networks.and.data = get.network.covariates.test.networks("artifact") + + expected.attributes = list( + range = network.covariates.test.build.expected( + c("2016-07-12 15:58:59 UTC"), c("2016-07-12 16:00:45 UTC"), + c("2016-07-12 16:06:32 UTC", "2016-07-12 16:06:32 UTC") + ), + cumulative = network.covariates.test.build.expected( + c("2016-07-12 15:58:59 UTC"), c("2016-07-12 
16:00:45 UTC"), + c("2016-07-12 16:06:32 UTC", "2016-07-12 16:06:32 UTC") + ), + all.ranges = network.covariates.test.build.expected( + c("2016-07-12 16:00:45 UTC"), c("2016-07-12 16:00:45 UTC"), + c("2016-07-12 16:06:32 UTC", "2016-07-12 16:06:32 UTC") + ), + project.cumulative = network.covariates.test.build.expected( + c("2016-07-12 15:58:59 UTC"), c("2016-07-12 16:00:45 UTC"), + c("2016-07-12 16:06:32 UTC", "2016-07-12 16:06:32 UTC") + ), + project.all.ranges = network.covariates.test.build.expected( + c("2016-07-12 16:00:45 UTC"), c("2016-07-12 16:00:45 UTC"), + c("2016-07-12 16:06:32 UTC", "2016-07-12 16:06:32 UTC") + ), + complete = network.covariates.test.build.expected( + c("2016-07-12 16:00:45 UTC"), c("2016-07-12 16:00:45 UTC"), + c("2016-07-12 16:06:32 UTC", "2016-07-12 16:06:32 UTC") + ) + ) + + ## convert date strings to POSIXct + expected.attributes = lapply(expected.attributes, function(times) { + lapply(times, function(date.vector) { + get.date.from.string(date.vector) + }) + }) + + ## Test + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.artifact.last.edited( + networks.and.data[["networks"]], networks.and.data[["project.data"]], + aggregation.level = level + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "last.edited") + + ## convert UNIX timestamps to POSIXct + actual.attributes = lapply(actual.attributes, get.date.from.unix.timestamp) + + expect_equal(expected.attributes[[level]], actual.attributes) + }) +}) + #' Test the add.vertex.attribute.artifact.change.count method test_that("Test add.vertex.attribute.artifact.change.count", { @@ -1347,6 +1410,1037 @@ test_that("Test add.vertex.attribute.artifact.change.count", { }) }) +## Unit tests for mail artifact networks + +#' mail thread contributor count +test_that("Test add.vertex.attribute.mail.thread.contributor.count", { + + ## Test setup + + networks.and.data = get.network.covariates.test.networks("artifact", artifact.relation = "mail") + + expected.attributes = list( + range = network.covariates.test.build.expected( + c(2), c(1), c(1) + ), + cumulative = network.covariates.test.build.expected( + c(2), c(1), c(2) + ), + all.ranges = network.covariates.test.build.expected( + c(2), c(2), c(2) + ), + project.cumulative = network.covariates.test.build.expected( + c(2), c(1), c(2) + ), + project.all.ranges = network.covariates.test.build.expected( + c(2), c(2), c(2) + ), + complete = network.covariates.test.build.expected( + c(2), c(2), c(2) + ) + ) + + ## Test + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.mail.thread.contributor.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], + aggregation.level = level + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "thread.contributor.count") + + expect_equal(expected.attributes[[level]], actual.attributes) + }) +}) + +#' mail thread message count +test_that("Test add.vertex.attribute.mail.thread.message.count", { + + ## Test setup + + networks.and.data = get.network.covariates.test.networks("artifact", artifact.relation = "mail") + + expected.attributes = list( + range = network.covariates.test.build.expected( + c(2), c(1), c(1) + ), + cumulative = network.covariates.test.build.expected( + c(2), c(1), c(2) + ), + all.ranges = network.covariates.test.build.expected( + c(2), c(2), c(2) + ), + project.cumulative = network.covariates.test.build.expected( + c(2), c(1), c(2) + ), + 
project.all.ranges = network.covariates.test.build.expected( + c(2), c(2), c(2) + ), + complete = network.covariates.test.build.expected( + c(2), c(2), c(2) + ) + ) + + ## Test + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.mail.thread.message.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], + aggregation.level = level + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "thread.message.count") + + expect_equal(expected.attributes[[level]], actual.attributes) + }) +}) + +#' mail thread start date +test_that("Test add.vertex.attribute.mail.thread.start.date", { + + ## Test setup + + networks.and.data = get.network.covariates.test.networks("artifact", artifact.relation = "mail") + + expected.attributes = list( + range = network.covariates.test.build.expected( + c("2016-07-12 15:58:40 UTC"), c("2016-07-12 16:04:40 UTC"), c("2016-07-12 16:05:37 UTC") + ), + cumulative = network.covariates.test.build.expected( + c("2016-07-12 15:58:40 UTC"), c("2016-07-12 16:04:40 UTC"), c("2016-07-12 16:04:40 UTC") + ), + all.ranges = network.covariates.test.build.expected( + c("2016-07-12 15:58:40 UTC"), c("2016-07-12 16:04:40 UTC"), c("2016-07-12 16:04:40 UTC") + ), + project.cumulative = network.covariates.test.build.expected( + c("2016-07-12 15:58:40 UTC"), c("2016-07-12 16:04:40 UTC"), c("2016-07-12 16:04:40 UTC") + ), + project.all.ranges = network.covariates.test.build.expected( + c("2016-07-12 15:58:40 UTC"), c("2016-07-12 16:04:40 UTC"), c("2016-07-12 16:04:40 UTC") + ), + complete = network.covariates.test.build.expected( + c("2016-07-12 15:58:40 UTC"), c("2016-07-12 16:04:40 UTC"), c("2016-07-12 16:04:40 UTC") + ) + ) + + ## convert date strings to POSIXct + expected.attributes = lapply(expected.attributes, function(times) { + lapply(times, function(date.vector) { + get.date.from.string(date.vector) + }) + }) + + ## Test + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.mail.thread.start.date( + networks.and.data[["networks"]], networks.and.data[["project.data"]], + aggregation.level = level + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "thread.start.date") + + ## convert UNIX timestamps to POSIXct + actual.attributes = lapply(actual.attributes, get.date.from.unix.timestamp) + + expect_equal(expected.attributes[[level]], actual.attributes) + }) +}) + +#' mail thread end date +test_that("Test add.vertex.attribute.mail.thread.end.date", { + + ## Test setup + + networks.and.data = get.network.covariates.test.networks("artifact", artifact.relation = "mail") + + expected.attributes = list( + range = network.covariates.test.build.expected( + c("2016-07-12 15:58:50 UTC"), c("2016-07-12 16:04:40 UTC"), c("2016-07-12 16:05:37 UTC") + ), + cumulative = network.covariates.test.build.expected( + c("2016-07-12 15:58:50 UTC"), c("2016-07-12 16:04:40 UTC"), c("2016-07-12 16:05:37 UTC") + ), + all.ranges = network.covariates.test.build.expected( + c("2016-07-12 15:58:50 UTC"), c("2016-07-12 16:05:37 UTC"), c("2016-07-12 16:05:37 UTC") + ), + project.cumulative = network.covariates.test.build.expected( + c("2016-07-12 15:58:50 UTC"), c("2016-07-12 16:04:40 UTC"), c("2016-07-12 16:05:37 UTC") + ), + project.all.ranges = network.covariates.test.build.expected( + c("2016-07-12 15:58:50 UTC"), c("2016-07-12 16:05:37 UTC"), c("2016-07-12 16:05:37 UTC") + ), + complete = network.covariates.test.build.expected( + 
c("2016-07-12 15:58:50 UTC"), c("2016-07-12 16:05:37 UTC"), c("2016-07-12 16:05:37 UTC") + ) + ) + + ## convert date strings to POSIXct + expected.attributes = lapply(expected.attributes, function(times) { + lapply(times, function(date.vector) { + get.date.from.string(date.vector) + }) + }) + + ## Test + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.mail.thread.end.date( + networks.and.data[["networks"]], networks.and.data[["project.data"]], + aggregation.level = level + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "thread.end.date") + + ## convert UNIX timestamps to POSIXct + actual.attributes = lapply(actual.attributes, get.date.from.unix.timestamp) + + expect_equal(expected.attributes[[level]], actual.attributes) + }) +}) + +#' mail thread originating mailing list +test_that("Test add.vertex.attribute.mail.thread.originating.mailing.list", { + + ## Test setup + + networks.and.data = get.network.covariates.test.networks("artifact", artifact.relation = "mail", bins = mybins.since.2010) + + expected.attributes = network.covariates.test.build.expected.since.2010( + c("42", "42", "42"), c("13"), c("13"), c("13") + ) + + ## Test + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.mail.thread.originating.mailing.list( + networks.and.data[["networks"]], networks.and.data[["project.data"]], + aggregation.level = level + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, + name = "thread.originating.mailing.list") + + expect_equal(expected.attributes, actual.attributes) + }) +}) + +## Unit tests for issue artifact networks + +#' Helper function for add.vertex.attribute.issue.* tests +#' +#' Merges expected data for issue networks, where attributes may be considered for issues only, PRs only, or both. +#' This function returns the data for both, given the data for issues only and PRs only, where \code{NA} is used for +#' vertices of the other type. +#' The data is given as lists of lists of vectors, and merged by replacing the \code{NA}-values of the first list +#' with the non-\code{NA} values of the second. Therefore, it only works if all vertices have a non-\code{NA} value in +#' exactly one of the two lists. +#' +#' @param expected.attributes.issues.only a list of lists of vectors, containing some value for each issue vertex and +#' \code{NA} for PR vertices +#' @param expected.attributes.prs.only a list of lists of vectors, containing some value for each PR vertex and +#' \code{NA} for issue vertices +#' +#' @return a list of lists of vectors, containing values for issue and PR vertices +merge.expected.attributes = function(expected.attributes.issues.only, expected.attributes.prs.only) { + result = lapply(names(expected.attributes.issues.only), function(n) { + issue.attr = expected.attributes.issues.only[[n]] + pr.attr = expected.attributes.prs.only[[n]] + sum.attr = lapply(names(issue.attr), function (n2) { + a = issue.attr[[n2]] + b = pr.attr[[n2]] + ## assign the non-NA values of b to the previously-NA values of a. + ## this only works properly if, at each index, exactly one of the vectors is NA. 
+ a[is.na(a)] = b[!is.na(b)] + return(a) + }) + names(sum.attr) = names(issue.attr) + return(sum.attr) + }) + names(result) = names(expected.attributes.issues.only) + return(result) +} + +#' issue contributor count +test_that("Test add.vertex.attribute.issue.contributor.count", { + ## Test setup + networks.and.data = get.network.covariates.test.networks("artifact", issues = TRUE, artifact.relation = "issue") + + expected.attributes.issues.only = list( + range = network.covariates.test.build.expected(c(1L, 2L, NA), c(NA, 1L, NA, 1L, NA), c(NA, 2L, 2L, 1L)), + cumulative = network.covariates.test.build.expected(c(1L, 2L, NA), c(NA, 1L, NA, 2L, NA), c(NA, 2L, 2L, 3L)), + all.ranges = network.covariates.test.build.expected(c(3L, 2L, NA), c(NA, 2L, NA, 3L, NA), c(NA, 2L, 2L, 3L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 2L, NA), c(NA, 1L, NA, 2L, NA), c(NA, 2L, 2L, 3L)), + project.all.ranges = network.covariates.test.build.expected(c(3L, 2L, NA), c(NA, 2L, NA, 3L, NA), c(NA, 2L, 2L, 3L)), + complete = network.covariates.test.build.expected(c(4L, 3L, NA), c(NA, 2L, NA, 4L, NA), c(NA, 3L, 2L, 4L)) + ) + + expected.attributes.prs.only = list( + range = network.covariates.test.build.expected(c(NA, NA, 1L), c(1L, NA, 2L, NA, 1L), c(2L, NA, NA, NA)), + cumulative = network.covariates.test.build.expected(c(NA, NA, 1L), c(2L, NA, 2L, NA, 1L), c(3L, NA, NA, NA)), + all.ranges = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 2L, NA, 1L), c(3L, NA, NA, NA)), + project.cumulative = network.covariates.test.build.expected(c(NA, NA, 1L), c(2L, NA, 2L, NA, 2L), c(3L, NA, NA, NA)), + project.all.ranges = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 2L, NA, 2L), c(3L, NA, NA, NA)), + complete = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 2L, NA, 2L), c(3L, NA, NA, NA)) + ) + + expected.attributes.both = merge.expected.attributes(expected.attributes.issues.only, expected.attributes.prs.only) + + ## Test issues only + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.contributor.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, type = "issues" + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.contributor.count") + + expect_identical(expected.attributes.issues.only[[level]], actual.attributes) + }) + + # Test PRs only + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.contributor.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, + type = "pull.requests") + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "pr.contributor.count") + + expect_identical(expected.attributes.prs.only[[level]], actual.attributes) + }) + + # Test both + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.contributor.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, type = "all" + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.contributor.count") + + expect_identical(expected.attributes.both[[level]], actual.attributes) + }) +}) + +test_that("Test add.vertex.attribute.issue.contributor.count with issues.only.comments", { + ## Test setup + networks.and.data = get.network.covariates.test.networks("artifact", issues = TRUE, 
artifact.relation = "issue", + issues.only.comments = TRUE) + + expected.attributes.issues.only = list( + range = network.covariates.test.build.expected(c(NA, 1L), c(NA, NA, 1L, 1L), c(NA, 2L)), + cumulative = network.covariates.test.build.expected(c(NA, 1L), c(NA, NA, 1L, 1L), c(NA, 2L)), + all.ranges = network.covariates.test.build.expected(c(NA, 1L), c(NA, NA, 2L, 1L), c(NA, 2L)), + project.cumulative = network.covariates.test.build.expected(c(NA, 1L), c(NA, NA, 1L, 1L), c(NA, 2L)), + project.all.ranges = network.covariates.test.build.expected(c(NA, 1L), c(NA, NA, 2L, 1L), c(NA, 2L)), + complete = network.covariates.test.build.expected(c(NA, 1L), c(NA, NA, 2L, 2L), c(NA, 2L)) + ) + + expected.attributes.prs.only = list( + range = network.covariates.test.build.expected(c(1L, NA), c(1L, 1L, NA, NA), c(1L, NA)), + cumulative = network.covariates.test.build.expected(c(1L, NA), c(2L, 1L, NA, NA), c(3L, NA)), + all.ranges = network.covariates.test.build.expected(c(3L, NA), c(3L, 1L, NA, NA), c(3L, NA)), + project.cumulative = network.covariates.test.build.expected(c(1L, NA), c(2L, 1L, NA, NA), c(3L, NA)), + project.all.ranges = network.covariates.test.build.expected(c(3L, NA), c(3L, 1L, NA, NA), c(3L, NA)), + complete = network.covariates.test.build.expected(c(3L, NA), c(3L, 1L, NA, NA), c(3L, NA)) + ) + + expected.attributes.both = merge.expected.attributes(expected.attributes.issues.only, expected.attributes.prs.only) + + ## Test issues only + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.contributor.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, type = "issues" + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.contributor.count") + + expect_identical(expected.attributes.issues.only[[level]], actual.attributes) + }) + + # Test PRs only + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.contributor.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, + type = "pull.requests") + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "pr.contributor.count") + + expect_identical(expected.attributes.prs.only[[level]], actual.attributes) + }) + + # Test both + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.contributor.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, type = "all" + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.contributor.count") + + expect_identical(expected.attributes.both[[level]], actual.attributes) + }) +}) + +test_that("Test add.vertex.attribute.issue.contributor.count with issues.only.comments and use.unfiltered.data", { + ## Test setup + networks.and.data = get.network.covariates.test.networks("artifact", issues = TRUE, artifact.relation = "issue", + issues.only.comments = TRUE) + + expected.attributes.issues.only = list( + range = network.covariates.test.build.expected(c(NA, 2L), c(NA, NA, 1L, 1L), c(NA, 2L)), + cumulative = network.covariates.test.build.expected(c(NA, 2L), c(NA, NA, 1L, 2L), c(NA, 2L)), + all.ranges = network.covariates.test.build.expected(c(NA, 2L), c(NA, NA, 2L, 3L), c(NA, 2L)), + project.cumulative = network.covariates.test.build.expected(c(NA, 2L), c(NA, NA, 1L, 2L), c(NA, 2L)), + project.all.ranges = 
network.covariates.test.build.expected(c(NA, 2L), c(NA, NA, 2L, 3L), c(NA, 2L)), + complete = network.covariates.test.build.expected(c(NA, 3L), c(NA, NA, 2L, 4L), c(NA, 2L)) + ) + + expected.attributes.prs.only = list( + range = network.covariates.test.build.expected(c(1L, NA), c(1L, 2L, NA, NA), c(2L, NA)), + cumulative = network.covariates.test.build.expected(c(1L, NA), c(2L, 2L, NA, NA), c(3L, NA)), + all.ranges = network.covariates.test.build.expected(c(3L, NA), c(3L, 2L, NA, NA), c(3L, NA)), + project.cumulative = network.covariates.test.build.expected(c(1L, NA), c(2L, 2L, NA, NA), c(3L, NA)), + project.all.ranges = network.covariates.test.build.expected(c(3L, NA), c(3L, 2L, NA, NA), c(3L, NA)), + complete = network.covariates.test.build.expected(c(3L, NA), c(3L, 2L, NA, NA), c(3L, NA)) + ) + + expected.attributes.both = merge.expected.attributes(expected.attributes.issues.only, expected.attributes.prs.only) + + ## Test issues only + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.contributor.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, + type = "issues", use.unfiltered.data = TRUE + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.contributor.count") + + expect_identical(expected.attributes.issues.only[[level]], actual.attributes) + }) + + # Test PRs only + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.contributor.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, + type = "pull.requests", use.unfiltered.data = TRUE) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "pr.contributor.count") + + expect_identical(expected.attributes.prs.only[[level]], actual.attributes) + }) + + # Test both + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.contributor.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, + type = "all", use.unfiltered.data = TRUE + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.contributor.count") + + expect_identical(expected.attributes.both[[level]], actual.attributes) + }) +}) + +#' issue event count +test_that("Test add.vertex.attribute.issue.event.count", { + ## Test setup + networks.and.data = get.network.covariates.test.networks("artifact", issues = TRUE, artifact.relation = "issue") + + expected.attributes.issues.only = list( + range = network.covariates.test.build.expected(c(1L, 3L, NA), c(NA, 2L, NA, 1L, NA), c(NA, 3L, 4L, 1L)), + cumulative = network.covariates.test.build.expected(c(1L, 3L, NA), c(NA, 2L, NA, 2L, NA), c(NA, 6L, 6L, 3L)), + all.ranges = network.covariates.test.build.expected(c(3L, 6L, NA), c(NA, 6L, NA, 3L, NA), c(NA, 6L, 6L, 3L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 3L, NA), c(NA, 2L, NA, 2L, NA), c(NA, 6L, 6L, 3L)), + project.all.ranges = network.covariates.test.build.expected(c(3L, 6L, NA), c(NA, 6L, NA, 3L, NA), c(NA, 6L, 6L, 3L)), + complete = network.covariates.test.build.expected(c(8L, 7L, NA), c(NA, 6L, NA, 8L, NA), c(NA, 7L, 6L, 8L)) + ) + + expected.attributes.prs.only = list( + range = network.covariates.test.build.expected(c(NA, NA, 2L), c(1L, NA, 2L, NA, 1L), c(2L, NA, NA, NA)), + cumulative = network.covariates.test.build.expected(c(NA, NA, 2L), c(3L, 
NA, 2L, NA, 1L), c(5L, NA, NA, NA)), + all.ranges = network.covariates.test.build.expected(c(NA, NA, 5L), c(5L, NA, 2L, NA, 1L), c(5L, NA, NA, NA)), + project.cumulative = network.covariates.test.build.expected(c(NA, NA, 2L), c(3L, NA, 2L, NA, 2L), c(5L, NA, NA, NA)), + project.all.ranges = network.covariates.test.build.expected(c(NA, NA, 5L), c(5L, NA, 2L, NA, 2L), c(5L, NA, NA, NA)), + complete = network.covariates.test.build.expected(c(NA, NA, 5L), c(5L, NA, 2L, NA, 2L), c(5L, NA, NA, NA)) + ) + + expected.attributes.both = merge.expected.attributes(expected.attributes.issues.only, expected.attributes.prs.only) + + ## Test issues only + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.event.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, type = "issues" + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.event.count") + + expect_identical(expected.attributes.issues.only[[level]], actual.attributes) + }) + + # Test PRs only + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.event.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, + type = "pull.requests") + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "pr.event.count") + + expect_identical(expected.attributes.prs.only[[level]], actual.attributes) + }) + + # Test both + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.event.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, type = "all" + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.event.count") + + expect_identical(expected.attributes.both[[level]], actual.attributes) + }) +}) + +#' issue comment count +test_that("Test add.vertex.attribute.issue.comment.event.count", { + ## Test setup + networks.and.data = get.network.covariates.test.networks("artifact", issues = TRUE, artifact.relation = "issue") + + expected.attributes.issues.only = list( + range = network.covariates.test.build.expected(c(0L, 1L, NA), c(NA, 1L, NA, 1L, NA), c(NA, 0L, 4L, 0L)), + cumulative = network.covariates.test.build.expected(c(0L, 1L, NA), c(NA, 1L, NA, 1L, NA), c(NA, 1L, 5L, 1L)), + all.ranges = network.covariates.test.build.expected(c(1L, 1L, NA), c(NA, 5L, NA, 1L, NA), c(NA, 1L, 5L, 1L)), + project.cumulative = network.covariates.test.build.expected(c(0L, 1L, NA), c(NA, 1L, NA, 1L, NA), c(NA, 1L, 5L, 1L)), + project.all.ranges = network.covariates.test.build.expected(c(1L, 1L, NA), c(NA, 5L, NA, 1L, NA), c(NA, 1L, 5L, 1L)), + complete = network.covariates.test.build.expected(c(2L, 1L, NA), c(NA, 5L, NA, 2L, NA), c(NA, 1L, 5L, 2L)) + ) + + expected.attributes.prs.only = list( + range = network.covariates.test.build.expected(c(NA, NA, 1L), c(1L, NA, 1L, NA, 0L), c(1L, NA, NA, NA)), + cumulative = network.covariates.test.build.expected(c(NA, NA, 1L), c(2L, NA, 1L, NA, 0L), c(3L, NA, NA, NA)), + all.ranges = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 1L, NA, 0L), c(3L, NA, NA, NA)), + project.cumulative = network.covariates.test.build.expected(c(NA, NA, 1L), c(2L, NA, 1L, NA, 1L), c(3L, NA, NA, NA)), + project.all.ranges = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 1L, NA, 1L), c(3L, NA, NA, NA)), + complete = 
network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 1L, NA, 1L), c(3L, NA, NA, NA)) + ) + + expected.attributes.both = merge.expected.attributes(expected.attributes.issues.only, expected.attributes.prs.only) + + ## Test issues only + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.comment.event.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, type = "issues" + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.comment.event.count") + + expect_identical(expected.attributes.issues.only[[level]], actual.attributes) + }) + + # Test PRs only + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.comment.event.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, + type = "pull.requests") + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "pr.comment.event.count") + + expect_identical(expected.attributes.prs.only[[level]], actual.attributes) + }) + + # Test both + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.comment.event.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, type = "all" + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.comment.event.count") + + expect_identical(expected.attributes.both[[level]], actual.attributes) + }) +}) + +#' issue opened date +test_that("Test add.vertex.attribute.issue.opened.date", { + ## Test setup + networks.and.data = get.network.covariates.test.networks("artifact", issues = TRUE, artifact.relation = "issue") + + expected.attributes.issues.only = network.covariates.test.build.expected( + c("2016-07-12 14:30:13", + "2016-07-12 15:59:25", + NA), + c(NA, + "2016-07-12 16:01:30", + NA, + "2016-07-12 14:30:13", + NA), + c(NA, + "2016-07-12 15:59:25", + "2016-07-12 16:01:30", + "2016-07-12 14:30:13")) + + expected.attributes.prs.only = network.covariates.test.build.expected( + c(NA, + NA, + "2016-07-14 13:37:00"), + c("2016-07-14 13:37:00", + NA, + "2016-07-12 16:02:02", + NA, + "2016-07-12 14:59:25"), + c("2016-07-14 13:37:00", + NA, + NA, + NA)) + + expected.attributes.both = network.covariates.test.build.expected( + c("2016-07-12 14:30:13", + "2016-07-12 15:59:25", + "2016-07-14 13:37:00"), + c("2016-07-14 13:37:00", + "2016-07-12 16:01:30", + "2016-07-12 16:02:02", + "2016-07-12 14:30:13", + "2016-07-12 14:59:25"), + c("2016-07-14 13:37:00", + "2016-07-12 15:59:25", + "2016-07-12 16:01:30", + "2016-07-12 14:30:13")) + + ## convert date strings to POSIXct + expected.attributes.issues.only = lapply(expected.attributes.issues.only, function(date.vector) {get.date.from.string(date.vector)}) + + expected.attributes.prs.only = lapply(expected.attributes.prs.only, function(date.vector) {get.date.from.string(date.vector)}) + + expected.attributes.both = lapply(expected.attributes.both, function(date.vector) {get.date.from.string(date.vector)}) + + ## Test issues only + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.opened.date( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, type = "issues" + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.opened.date") + ## convert UNIX 
timestamps to POSIXct + actual.attributes = lapply(actual.attributes, get.date.from.unix.timestamp) + + expect_identical(expected.attributes.issues.only, actual.attributes) + }) + + # Test PRs only + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.opened.date( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, + type = "pull.requests") + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "pr.opened.date") + ## convert UNIX timestamps to POSIXct + actual.attributes = lapply(actual.attributes, get.date.from.unix.timestamp) + + expect_identical(expected.attributes.prs.only, actual.attributes) + }) + + # Test both + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.opened.date( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, type = "all" + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.opened.date") + ## convert UNIX timestamps to POSIXct + actual.attributes = lapply(actual.attributes, get.date.from.unix.timestamp) + + expect_identical(expected.attributes.both, actual.attributes) + }) +}) + +#' issue closed date +test_that("Test add.vertex.attribute.issue.closed.date", { + ## Test setup + networks.and.data = get.network.covariates.test.networks("artifact", issues = TRUE, artifact.relation = "issue") + + expected.attributes.issues.only = network.covariates.test.build.expected( + c(NA, + "2016-07-12 16:06:30", + NA), + c(NA, + NA, + NA, + NA, + NA), + c(NA, + "2016-07-12 16:06:30", + NA, + NA)) + + expected.attributes.prs.only = network.covariates.test.build.expected( + c(NA, + NA, + NA), + c(NA, + NA, + NA, + NA, + "2016-07-12 16:04:59"), + c(NA, + NA, + NA, + NA)) + + expected.attributes.both = network.covariates.test.build.expected( + c(NA, + "2016-07-12 16:06:30", + NA), + c(NA, + NA, + NA, + NA, + "2016-07-12 16:04:59"), + c(NA, + "2016-07-12 16:06:30", + NA, + NA)) + + ## convert date strings to POSIXct + expected.attributes.issues.only = lapply(expected.attributes.issues.only, function(date.vector) {get.date.from.string(date.vector)}) + + expected.attributes.prs.only = lapply(expected.attributes.prs.only, function(date.vector) {get.date.from.string(date.vector)}) + + expected.attributes.both = lapply(expected.attributes.both, function(date.vector) {get.date.from.string(date.vector)}) + + ## Test issues only + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.closed.date( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, type = "issues" + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.closed.date") + ## convert UNIX timestamps to POSIXct + actual.attributes = lapply(actual.attributes, get.date.from.unix.timestamp) + + expect_identical(expected.attributes.issues.only, actual.attributes) + }) + + # Test PRs only + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.closed.date( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, + type = "pull.requests") + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "pr.closed.date") + ## convert UNIX timestamps to POSIXct + actual.attributes = lapply(actual.attributes, get.date.from.unix.timestamp) + + 
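## the attribute itself is stored as a numeric UNIX timestamp, hence the conversion above +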
expect_identical(expected.attributes.prs.only, actual.attributes) + }) + + # Test both + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.closed.date( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, type = "all" + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.closed.date") + ## convert UNIX timestamps to POSIXct + actual.attributes = lapply(actual.attributes, get.date.from.unix.timestamp) + + expect_identical(expected.attributes.both, actual.attributes) + }) +}) + +#' issue last activity date +test_that("Test add.vertex.attribute.issue.last.activity.date", { + ## Test setup + networks.and.data = get.network.covariates.test.networks("artifact", issues = TRUE, artifact.relation = "issue") + + expected.attributes.issues.only = list( + range = network.covariates.test.build.expected( + c("2016-07-12 15:30:02", "2016-07-12 15:59:59", NA), + c(NA , "2016-07-12 16:02:30", NA , "2016-07-12 16:03:59", NA), + c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + cumulative = network.covariates.test.build.expected( + c("2016-07-12 15:30:02", "2016-07-12 15:59:59", NA), + c(NA , "2016-07-12 16:02:30", NA , "2016-07-12 16:03:59", NA), + c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + all.ranges = network.covariates.test.build.expected( + c("2016-08-31 15:30:02", "2016-08-31 16:45:09", NA), + c(NA , "2016-07-28 06:27:52", NA , "2016-08-31 15:30:02", NA), + c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + project.cumulative = network.covariates.test.build.expected( + c("2016-07-12 15:30:02", "2016-07-12 15:59:59", NA), + c(NA , "2016-07-12 16:02:30", NA , "2016-07-12 16:03:59", NA), + c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + project.all.ranges = network.covariates.test.build.expected( + c("2016-08-31 15:30:02", "2016-08-31 16:45:09", NA), + c(NA , "2016-07-28 06:27:52", NA , "2016-08-31 15:30:02", NA), + c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + complete = network.covariates.test.build.expected( + c("2017-05-23 12:32:39", "2016-10-05 16:45:09", NA), + c(NA , "2016-07-28 06:27:52", NA , "2017-05-23 12:32:39", NA), + c(NA , "2016-10-05 16:45:09", "2016-07-28 06:27:52", "2017-05-23 12:32:39"))) + + expected.attributes.prs.only = list( + range = network.covariates.test.build.expected( + c(NA , NA , "2016-07-12 15:59:59"), + c("2016-07-12 16:01:01", NA , "2016-07-12 16:02:02", NA , "2016-07-12 16:04:59"), + c("2016-07-14 13:37:00", NA , NA , NA)), + cumulative = network.covariates.test.build.expected( + c(NA , NA , "2016-07-12 15:59:59"), + c("2016-07-12 16:01:01", NA , "2016-07-12 16:02:02", NA , "2016-07-12 16:04:59"), + c("2016-07-14 13:37:00", NA , NA , NA)), + all.ranges = network.covariates.test.build.expected( + c(NA , NA , "2016-07-14 13:37:00"), + c("2016-07-14 13:37:00", NA , "2016-07-12 16:02:02", NA , "2016-07-12 16:04:59"), + c("2016-07-14 13:37:00", NA , NA , NA)), + project.cumulative = network.covariates.test.build.expected( + c(NA , NA , "2016-07-12 15:59:59"), + c("2016-07-12 16:01:01", NA , "2016-07-12 16:02:02", NA , "2016-07-12 16:04:59"), + c("2016-07-14 13:37:00", NA , NA , NA)), + project.all.ranges = network.covariates.test.build.expected( + c(NA , NA , "2016-07-14 13:37:00"), + c("2016-07-14 13:37:00", NA , "2016-07-12 16:02:02", NA , "2016-07-12 16:04:59"), + c("2016-07-14 
13:37:00", NA , NA , NA)), + complete = network.covariates.test.build.expected( + c(NA , NA , "2016-07-14 13:37:00"), + c("2016-07-14 13:37:00", NA , "2016-07-12 16:02:02", NA , "2016-07-12 16:04:59"), + c("2016-07-14 13:37:00", NA , NA , NA))) + + expected.attributes.both = list( + range = network.covariates.test.build.expected( + c("2016-07-12 15:30:02", "2016-07-12 15:59:59", "2016-07-12 15:59:59"), + c("2016-07-12 16:01:01", "2016-07-12 16:02:30", "2016-07-12 16:02:02", "2016-07-12 16:03:59", "2016-07-12 16:04:59"), + c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + cumulative = network.covariates.test.build.expected( + c("2016-07-12 15:30:02", "2016-07-12 15:59:59", "2016-07-12 15:59:59"), + c("2016-07-12 16:01:01", "2016-07-12 16:02:30", "2016-07-12 16:02:02", "2016-07-12 16:03:59", "2016-07-12 16:04:59"), + c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + all.ranges = network.covariates.test.build.expected( + c("2016-08-31 15:30:02", "2016-08-31 16:45:09", "2016-07-14 13:37:00"), + c("2016-07-14 13:37:00", "2016-07-28 06:27:52", "2016-07-12 16:02:02", "2016-08-31 15:30:02", "2016-07-12 16:04:59"), + c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + project.cumulative = network.covariates.test.build.expected( + c("2016-07-12 15:30:02", "2016-07-12 15:59:59", "2016-07-12 15:59:59"), + c("2016-07-12 16:01:01", "2016-07-12 16:02:30", "2016-07-12 16:02:02", "2016-07-12 16:03:59", "2016-07-12 16:04:59"), + c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + project.all.ranges = network.covariates.test.build.expected( + c("2016-08-31 15:30:02", "2016-08-31 16:45:09", "2016-07-14 13:37:00"), + c("2016-07-14 13:37:00", "2016-07-28 06:27:52", "2016-07-12 16:02:02", "2016-08-31 15:30:02", "2016-07-12 16:04:59"), + c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + complete = network.covariates.test.build.expected( + c("2017-05-23 12:32:39", "2016-10-05 16:45:09", "2016-07-14 13:37:00"), + c("2016-07-14 13:37:00", "2016-07-28 06:27:52", "2016-07-12 16:02:02", "2017-05-23 12:32:39", "2016-07-12 16:04:59"), + c("2016-07-14 13:37:00", "2016-10-05 16:45:09", "2016-07-28 06:27:52", "2017-05-23 12:32:39"))) + + ## convert date strings to POSIXct + expected.attributes.issues.only = lapply(expected.attributes.issues.only, function(times) { + lapply(times, function(date.vector) { + get.date.from.string(date.vector) + }) + }) + expected.attributes.prs.only = lapply(expected.attributes.prs.only, function(times) { + lapply(times, function(date.vector) { + get.date.from.string(date.vector) + }) + }) + expected.attributes.both = lapply(expected.attributes.both, function(times) { + lapply(times, function(date.vector) { + get.date.from.string(date.vector) + }) + }) + + ## Test issues only + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.last.activity.date( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, type = "issues" + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.last.activity") + ## convert UNIX timestamps to POSIXct + actual.attributes = lapply(actual.attributes, get.date.from.unix.timestamp) + + expect_identical(expected.attributes.issues.only[[level]], actual.attributes) + }) + + # Test PRs only + + 
lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.last.activity.date( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, + type = "pull.requests") + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "pr.last.activity") + ## convert UNIX timestamps to POSIXct + actual.attributes = lapply(actual.attributes, get.date.from.unix.timestamp) + + expect_identical(expected.attributes.prs.only[[level]], actual.attributes) + }) + + # Test both + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.last.activity.date( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, type = "all" + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.last.activity") + ## convert UNIX timestamps to POSIXct + actual.attributes = lapply(actual.attributes, get.date.from.unix.timestamp) + + expect_identical(expected.attributes.both[[level]], actual.attributes) + }) +}) + +#' issue title +test_that("Test add.vertex.attribute.issue.title", { + ## Test setup + networks.and.data = get.network.covariates.test.networks("artifact", issues = TRUE, artifact.relation = "issue") + + expected.attributes.issues.only = network.covariates.test.build.expected( + c("Distinguish directedness of networks and edge-construction algorithm", + "Error in construct.networks.from.list for openssl function networks", + NA), + c(NA, + "[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table", + NA, + "Distinguish directedness of networks and edge-construction algorithm", + NA), + c(NA, + "Error in construct.networks.from.list for openssl function networks", + "[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table", + "Distinguish directedness of networks and edge-construction algorithm")) + + expected.attributes.prs.only = network.covariates.test.build.expected( + c(NA, + NA, + "Example pull request 1"), + c("Example pull request 1", + NA, + "Example pull request 4", + NA, + "Example pull request 2"), + c("Example pull request 1", + NA, + NA, + NA)) + + expected.attributes.both = network.covariates.test.build.expected( + c("Distinguish directedness of networks and edge-construction algorithm", + "Error in construct.networks.from.list for openssl function networks", + "Example pull request 1"), + c("Example pull request 1", + "[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table", + "Example pull request 4", + "Distinguish directedness of networks and edge-construction algorithm", + "Example pull request 2"), + c("Example pull request 1", + "Error in construct.networks.from.list for openssl function networks", + "[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table", + "Distinguish directedness of networks and edge-construction algorithm")) + + ## Test issues only + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.title( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, type = "issues" + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.title") + + expect_identical(expected.attributes.issues.only, actual.attributes) + }) + + # Test PRs only + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.title( + networks.and.data[["networks"]], 
networks.and.data[["project.data"]], aggregation.level = level, + type = "pull.requests") + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "pr.title") + + expect_identical(expected.attributes.prs.only, actual.attributes) + }) + + # Test both + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.title( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, type = "all" + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.title") + + expect_identical(expected.attributes.both, actual.attributes) + }) +}) + +#' pull request state +test_that("Test add.vertex.attribute.pr.open.merged.or.closed", { + + ## Test setup + + networks.and.data = get.network.covariates.test.networks("artifact", artifact.relation = "issue") + + expected.attributes = network.covariates.test.build.expected( + c(NA, NA, "open"), c("open", NA, "open", NA, "merged"), c("open", NA, NA, NA) + ) + + ## Test + + networks.with.attr = add.vertex.attribute.pr.open.merged.or.closed( + networks.and.data[["networks"]], networks.and.data[["project.data"]]) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "pull.request.state") + + expect_equal(expected.attributes, actual.attributes) +}) + +#' issue is pull request +test_that("Test add.vertex.attribute.issue.is.pull.request", { + + ## Test setup + + networks.and.data = get.network.covariates.test.networks("artifact", artifact.relation = "issue") + + expected.attributes = network.covariates.test.build.expected( + c(FALSE, FALSE, TRUE), c(TRUE, FALSE, TRUE, FALSE, TRUE), c(TRUE, FALSE, FALSE, FALSE) + ) + + ## Test + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.is.pull.request( + networks.and.data[["networks"]], networks.and.data[["project.data"]], + aggregation.level = level + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.is.pull.request") + + expect_equal(expected.attributes, actual.attributes) + }) +}) + ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Unit tests for empty attribute data ------------------------------------- @@ -1367,7 +2461,7 @@ test_that("Test addition of attributes despite of empty data", { names(networks) = range ## add commit-count attribute - net.commit.count = add.vertex.attribute.commit.count.author(networks, proj.data.empty, default = 0L)[[1]] + net.commit.count = add.vertex.attribute.author.commit.count(networks, proj.data.empty, default = 0L)[[1]] expect_true("commit.count" %in% igraph::list.vertex.attributes(net.commit.count)) ## add author-role attribute: @@ -1399,7 +2493,7 @@ test_that("Test addition of attributes despite of non-captured vertices", { names(networks) = range ## add commit-count attribute - net.commit.count = add.vertex.attribute.commit.count.committer.and.author(networks, proj.data.empty, default = 0L)[[1]] + net.commit.count = add.vertex.attribute.author.commit.count.committer.and.author(networks, proj.data.empty, default = 0L)[[1]] ## check existence and proper value expect_true("commit.count.committer.and.author" %in% igraph::list.vertex.attributes(net.commit.count)) diff --git a/tests/test-networks-cut.R b/tests/test-networks-cut.R index 8baf7421..c959b097 100644 --- a/tests/test-networks-cut.R +++ b/tests/test-networks-cut.R @@ -69,7 +69,7 @@ test_that("Cut commit and mail data to same date 
range.", { date = get.date.from.string(c("2016-07-12 16:04:40", "2016-07-12 16:05:37")), date.offset = as.integer(c(100, 200)), subject = c("Re: Fw: busybox 2 tab", "Re: Fw: busybox 10"), - thread = sprintf("", c(9, 9)), + thread = sprintf("", c("13#9", "13#9")), artifact.type = c("Mail", "Mail")) commit.data = x$get.project.data()$get.commits.unfiltered() diff --git a/tests/test-networks-equal-constructions.R b/tests/test-networks-equal-constructions.R index 9f254b9a..feb3f7d2 100644 --- a/tests/test-networks-equal-constructions.R +++ b/tests/test-networks-equal-constructions.R @@ -14,6 +14,7 @@ ## Copyright 2018 by Christian Hechtl ## Copyright 2018 by Claus Hunsen ## Copyright 2020 by Thomas Bock +## Copyright 2022 by Jonathan Baumann ## All Rights Reserved. @@ -33,74 +34,60 @@ if (!dir.exists(CF.DATA)) CF.DATA = file.path(".", "tests", "codeface-data") #' Compare the edges and vertices of the corresponding networks constructed in different ways. #' -#' @param split.author.networks.one the first list of split author networks -#' @param split.author.networks.two the second list of split author networks -#' @param split.bipartite.networks.one the first list of split bipartite networks -#' @param split.bipartite.networks.two the second list of split bipartite networks -compare.edge.and.vertex.lists = function(split.author.networks.one = NULL, split.author.networks.two = NULL, - split.bipartite.networks.one = NULL, split.bipartite.networks.two = NULL) { - - for (i in seq_along(split.author.networks.one)) { - author.edges.one = igraph::get.data.frame(split.author.networks.one[[i]], what = "edges") - ordering = order(author.edges.one[["from"]], author.edges.one[["to"]], - author.edges.one[["date"]]) - author.edges.one = author.edges.one[ordering, ] - rownames(author.edges.one) = seq_len(nrow(author.edges.one)) - author.edges.two = igraph::get.data.frame(split.author.networks.two[[i]], what = "edges") - ordering = order(author.edges.two[["from"]], author.edges.two[["to"]], - author.edges.two[["date"]]) - author.edges.two = author.edges.two[ordering, ] - rownames(author.edges.two) = seq_len(nrow(author.edges.two)) - author.vertices.one = igraph::get.data.frame(split.author.networks.one[[i]], what = "vertices") - ordering = order(author.vertices.one[["name"]]) - author.vertices.one = author.vertices.one[ordering, ] - rownames(author.vertices.one) = seq_len(nrow(author.vertices.one)) - author.vertices.two = igraph::get.data.frame(split.author.networks.two[[i]], what = "vertices") - ordering = order(author.vertices.two[["name"]]) - author.vertices.two = author.vertices.two[ordering, ] - rownames(author.vertices.two) = seq_len(nrow(author.vertices.two)) - - expect_identical(author.edges.one, author.edges.two) - expect_identical(author.vertices.one, author.vertices.two) - - bipartite.edges.one = igraph::get.data.frame(split.bipartite.networks.one[[i]], what = "edges") - ordering = order(bipartite.edges.one[["from"]], bipartite.edges.one[["to"]], - bipartite.edges.one[["date"]]) - bipartite.edges.one = bipartite.edges.one[ordering, ] - rownames(bipartite.edges.one) = seq_len(nrow(bipartite.edges.one)) - bipartite.edges.two = igraph::get.data.frame(split.bipartite.networks.two[[i]], what = "edges") - ordering = order(bipartite.edges.two[["from"]], bipartite.edges.two[["to"]], - bipartite.edges.two[["date"]]) - bipartite.edges.two = bipartite.edges.two[ordering, ] - rownames(bipartite.edges.two) = seq_len(nrow(bipartite.edges.two)) - bipartite.vertices.one = 
igraph::get.data.frame(split.bipartite.networks.one[[i]], what = "vertices") - ordering = order(bipartite.vertices.one[["name"]]) - bipartite.vertices.one = bipartite.vertices.one[ordering, ] - rownames(bipartite.vertices.one) = seq_len(nrow(bipartite.vertices.one)) - bipartite.vertices.two = igraph::get.data.frame(split.bipartite.networks.two[[i]], what = "vertices") - ordering = order(bipartite.vertices.two[["name"]]) - bipartite.vertices.two = bipartite.vertices.two[ordering, ] - rownames(bipartite.vertices.two) = seq_len(nrow(bipartite.vertices.two)) - - expect_identical(bipartite.edges.one, bipartite.edges.two) - expect_identical(bipartite.vertices.one, bipartite.vertices.two) +#' @param split.networks.one the first list of split networks +#' @param split.networks.two the second list of split networks +compare.edge.and.vertex.lists = function(split.networks.one, split.networks.two) { + + for (i in seq_along(split.networks.one)) { + edges.one = igraph::get.data.frame(split.networks.one[[i]], what = "edges") + ordering = order(edges.one[["from"]], edges.one[["to"]], + edges.one[["date"]]) + edges.one = edges.one[ordering, ] + rownames(edges.one) = seq_len(nrow(edges.one)) + edges.two = igraph::get.data.frame(split.networks.two[[i]], what = "edges") + ordering = order(edges.two[["from"]], edges.two[["to"]], + edges.two[["date"]]) + edges.two = edges.two[ordering, ] + rownames(edges.two) = seq_len(nrow(edges.two)) + vertices.one = igraph::get.data.frame(split.networks.one[[i]], what = "vertices") + ordering = order(vertices.one[["name"]]) + vertices.one = vertices.one[ordering, ] + rownames(vertices.one) = seq_len(nrow(vertices.one)) + vertices.two = igraph::get.data.frame(split.networks.two[[i]], what = "vertices") + ordering = order(vertices.two[["name"]]) + vertices.two = vertices.two[ordering, ] + rownames(vertices.two) = seq_len(nrow(vertices.two)) + + expect_identical(edges.one, edges.two) + expect_identical(vertices.one, vertices.two) } } -patrick::with_parameters_test_that("Compare the bipartite and author network constructed in two ways - with author/artifact relation 'cochange', ", { +## Test that splitting a multi network and then extracting the individual networks +## yields the same networks as extracting the networks first and then splitting +## them with split.networks.time.based. +## Note that this is only the case if both the author and the bipartite network are included, +## as otherwise, the multi network might cover a different time period and therefore split differently. +## Including the artifact network is optional, as every edge in the artifact network +## will have a corresponding edge in the author network. 
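+## In short, the following two pipelines must yield identical per-window networks (a minimal sketch, where 'n' is the number of windows): +## (1) split.networks.time.based(list(author.network, bipartite.network), number.windows = n) +## (2) applying extract.author.network.from.network and extract.bipartite.network.from.network +## to each network in split.network.time.based(multi.network, number.windows = n)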
+patrick::with_parameters_test_that("Compare the bipartite and author network constructed in two ways", { ## configuration object for the datapath proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) net.conf = NetworkConf$new() - net.conf$update.values(updated.values = list(author.relation = "cochange", artifact.relation = "cochange")) + net.conf$update.values(updated.values = list(author.relation = test.author.relation, + artifact.relation = test.artifact.relation, + author.all.authors = TRUE)) + net.conf$clear.edge.attributes() ## construct objects proj.data = ProjectData$new(project.conf = proj.conf) network.builder = NetworkBuilder$new(project.data = proj.data, network.conf = net.conf) - splitting.period = "3 min" + ## splitting by time.period would require different periods for different network relations + ## to produce reasonable results in reasonable time + number.windows = 5 ## network generation 1 author.network = network.builder$get.author.network() @@ -108,7 +95,9 @@ patrick::with_parameters_test_that("Compare the bipartite and author network con ## split the networks split.networks = split.networks.time.based(networks = list(author.network, bipartite.network), - time.period = splitting.period, sliding.window = test.sliding.window) + number.windows = number.windows, + sliding.window = test.sliding.window, + remove.isolates = FALSE) ## separate the author and bipartite networks split.author.networks.one = split.networks[[1]] @@ -118,16 +107,17 @@ patrick::with_parameters_test_that("Compare the bipartite and author network con multi.network = network.builder$get.multi.network() ## split the network - multi.network.split = split.network.time.based(network = multi.network, time.period = splitting.period, - sliding.window = test.sliding.window) + multi.network.split = split.network.time.based(network = multi.network, number.windows = number.windows, + sliding.window = test.sliding.window, + remove.isolates = FALSE) split.author.networks.two = list() split.bipartite.networks.two = list() - ## extract the author and bipartite networks from the splitted multi networks + ## extract the author and bipartite networks from the split multi networks for (i in seq_along(multi.network.split)) { - author.net = extract.author.network.from.network(multi.network.split[[i]], remove.isolates = TRUE) - bipartite.net = extract.bipartite.network.from.network(multi.network.split[[i]]) + author.net = extract.author.network.from.network(multi.network.split[[i]], remove.isolates = FALSE) + bipartite.net = extract.bipartite.network.from.network(multi.network.split[[i]], remove.isolates = FALSE) split.author.networks.two[[i]] = author.net split.bipartite.networks.two[[i]] = bipartite.net @@ -135,73 +125,113 @@ patrick::with_parameters_test_that("Compare the bipartite and author network con ## compare the edges and the vertices of all the author and bipartite networks that were previously ## created with different approaches - compare.edge.and.vertex.lists(split.author.networks.one, split.author.networks.two, - split.bipartite.networks.one, split.bipartite.networks.two) -}, patrick::cases( - "sliding window: FALSE" = list(test.sliding.window = FALSE), - "sliding window: TRUE" = list(test.sliding.window = TRUE) + compare.edge.and.vertex.lists(split.author.networks.one, split.author.networks.two) + compare.edge.and.vertex.lists(split.bipartite.networks.one, split.bipartite.networks.two) +}, cases.cross.product( + cases.cross.product( + patrick::cases( + "with author relation 
'cochange'" = list(test.author.relation = 'cochange'), + "with author relation 'mail'" = list(test.author.relation = 'mail'), + "with author relation 'issue'" = list(test.author.relation = 'issue') + ), + patrick::cases( + "artifact relation 'cochange'" = list(test.artifact.relation = 'cochange'), + "artifact relation 'mail'" = list(test.artifact.relation = 'mail'), + "artifact relation 'issue'" = list(test.artifact.relation = 'issue') + ) + ), + patrick::cases( + "sliding window: FALSE" = list(test.sliding.window = FALSE), + "sliding window: TRUE" = list(test.sliding.window = TRUE) + ) )) -patrick::with_parameters_test_that("Compare the bipartite and author network constructed in two ways - with author relation 'mail' and artifact relation 'cochange', ", { +patrick::with_parameters_test_that("Compare the author, artifact and bipartite network constructed in two ways", { ## configuration object for the datapath proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) net.conf = NetworkConf$new() - net.conf$update.values(updated.values = list(author.relation = "mail", artifact.relation = "cochange")) + net.conf$update.values(updated.values = list(author.relation = test.author.relation, + artifact.relation = test.artifact.relation, + author.all.authors = TRUE)) net.conf$clear.edge.attributes() ## construct objects proj.data = ProjectData$new(project.conf = proj.conf) network.builder = NetworkBuilder$new(project.data = proj.data, network.conf = net.conf) - splitting.period = "3 min" + ## splitting by time.period would require different periods for different network relations + ## to produce reasonable results in reasonable time + number.windows = 5 ## network generation 1 - author.network = network.builder$get.author.network() bipartite.network = network.builder$get.bipartite.network() + artifact.network = network.builder$get.artifact.network() + author.network = network.builder$get.author.network() ## split the networks - split.networks = split.networks.time.based(networks = list(author.network, bipartite.network), - time.period = splitting.period, sliding.window = test.sliding.window) + split.networks = split.networks.time.based(networks = list(bipartite.network, artifact.network, + author.network), + number.windows = number.windows, + sliding.window = test.sliding.window, + remove.isolates = FALSE) - ## separate the author and bipartite networks - split.author.networks.one = split.networks[[1]] - split.bipartite.networks.one = split.networks[[2]] + ## separate the bipartite and artifact networks + split.bipartite.networks.one = split.networks[[1]] + split.artifact.networks.one = split.networks[[2]] + split.author.networks.one = split.networks[[3]] ## network generation 2 multi.network = network.builder$get.multi.network() ## split the network - multi.network.split = split.network.time.based(network = multi.network, time.period = splitting.period, - sliding.window = test.sliding.window) + multi.network.split = split.network.time.based(network = multi.network, number.windows = number.windows, + sliding.window = test.sliding.window, + remove.isolates = FALSE) - split.author.networks.two = list() split.bipartite.networks.two = list() + split.artifact.networks.two = list() + split.author.networks.two = list() - ## extract the author and bipartite networks from the splitted multi networks + ## extract the bipartite and artifact networks from the split multi networks for (i in seq_along(multi.network.split)) { - author.net = 
extract.author.network.from.network(multi.network.split[[i]], remove.isolates = TRUE) - bipartite.net = extract.bipartite.network.from.network(multi.network.split[[i]]) + bipartite.net = extract.bipartite.network.from.network(multi.network.split[[i]], remove.isolates = FALSE) + artifact.net = extract.artifact.network.from.network(multi.network.split[[i]], remove.isolates = FALSE) + author.net = extract.author.network.from.network(multi.network.split[[i]], remove.isolates = FALSE) - split.author.networks.two[[i]] = author.net split.bipartite.networks.two[[i]] = bipartite.net + split.artifact.networks.two[[i]] = artifact.net + split.author.networks.two[[i]] = author.net } - ## compare the edges and the vertices of all the author and bipartite networks that were previously + ## compare the edges and the vertices of all the bipartite, artifact, and author networks that were previously ## created with different approaches - compare.edge.and.vertex.lists(split.author.networks.one, split.author.networks.two, - split.bipartite.networks.one, split.bipartite.networks.two) -}, patrick::cases( - "sliding window: FALSE" = list(test.sliding.window = FALSE), - "sliding window: TRUE" = list(test.sliding.window = TRUE) + compare.edge.and.vertex.lists(split.bipartite.networks.one, split.bipartite.networks.two) + compare.edge.and.vertex.lists(split.artifact.networks.one, split.artifact.networks.two) + compare.edge.and.vertex.lists(split.author.networks.one, split.author.networks.two) +}, cases.cross.product( + cases.cross.product( + patrick::cases( + "with author relation 'cochange'" = list(test.author.relation = 'cochange'), + "with author relation 'mail'" = list(test.author.relation = 'mail'), + "with author relation 'issue'" = list(test.author.relation = 'issue') + ), + patrick::cases( + "artifact relation 'cochange'" = list(test.artifact.relation = 'cochange'), + "artifact relation 'mail'" = list(test.artifact.relation = 'mail'), + "artifact relation 'issue'" = list(test.artifact.relation = 'issue') + ) + ), + patrick::cases( + "sliding window: FALSE" = list(test.sliding.window = FALSE), + "sliding window: TRUE" = list(test.sliding.window = TRUE) + ) )) -patrick::with_parameters_test_that("Compare the bipartite and author network constructed in two ways - with author and artifact relation 'mail', ", { - +## Vertex attribute order +test_that("Compare networks after adding vertex attributes in different order", { ## configuration object for the datapath proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) @@ -213,45 +243,69 @@ patrick::with_parameters_test_that("Compare the bipartite and author network con proj.data = ProjectData$new(project.conf = proj.conf) network.builder = NetworkBuilder$new(project.data = proj.data, network.conf = net.conf) - splitting.period = "3 year" + author.network = network.builder$get.author.network() + networks = split.network.time.based(author.network, number.windows = 2) + + ## add commit count or email attribute + networks.commit.count = add.vertex.attribute.author.commit.count(networks, proj.data, aggregation.level = "range") + networks.email = add.vertex.attribute.author.email(networks, proj.data) + + ## add the other attribute + networks.both.1 = add.vertex.attribute.author.email(networks.commit.count, proj.data) + networks.both.2 = add.vertex.attribute.author.commit.count(networks.email, proj.data, aggregation.level = "range") + + ## Order of attributes is now different, while the content is the same. 
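+ ## (vertex attributes are stored in the order in which they were added, and the comparison is order-sensitive)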
+ ## The resulting networks are therefore not equal. + expect_false(compare(networks.both.1, networks.both.2)$equal) +}) + +## Vertex attribute added twice +test_that("Compare networks after adding vertex attribute once or twice", { + ## configuration object for the datapath + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + + net.conf = NetworkConf$new() + net.conf$update.values(updated.values = list(author.relation = "mail", artifact.relation = "mail")) + net.conf$clear.edge.attributes() + + ## construct objects + proj.data = ProjectData$new(project.conf = proj.conf) + network.builder = NetworkBuilder$new(project.data = proj.data, network.conf = net.conf) - ## network generation 1 author.network = network.builder$get.author.network() - bipartite.network = network.builder$get.bipartite.network() + networks = split.network.time.based(author.network, number.windows = 2) - ## split the networks - split.networks = split.networks.time.based(networks = list(author.network, bipartite.network), - time.period = splitting.period, sliding.window = test.sliding.window) + ## add email attribute + networks.email = add.vertex.attribute.author.email(networks, proj.data) - ## separate the author and bipartite networks - split.author.networks.one = split.networks[[1]] - split.bipartite.networks.one = split.networks[[2]] + ## add email attribute again + networks.email.twice = add.vertex.attribute.author.email(networks.email, proj.data) - ## network generation 2 - multi.network = network.builder$get.multi.network() + ## the attribute should only be contained once, so the resulting graphs should be equal + compare.edge.and.vertex.lists(networks.email, networks.email.twice) - ## split the network - multi.network.split = split.network.time.based(network = multi.network, time.period = splitting.period, - sliding.window = test.sliding.window) +}) - split.author.networks.two = list() - split.bipartite.networks.two = list() +## Edge attribute order +test_that("Compare networks after adding edge attributes in different order", { + ## configuration object for the datapath + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + net.conf = NetworkConf$new() + net.conf$update.values(updated.values = list(author.relation = "mail", artifact.relation = "mail")) + net.conf$clear.edge.attributes() - ## extract the author and bipartite networks from the splitted multi networks - for (i in seq_along(multi.network.split)) { - author.net = extract.author.network.from.network(multi.network.split[[i]], remove.isolates = TRUE) - bipartite.net = extract.bipartite.network.from.network(multi.network.split[[i]]) + ## construct two networks with the edge attributes specified in different orders + proj.data = ProjectData$new(project.conf = proj.conf) + network.builder = NetworkBuilder$new(project.data = proj.data, network.conf = net.conf) + network.builder$update.network.conf(updated.values = list(edge.attributes = c("author.name", "author.email"))) + author.network = network.builder$get.author.network() + networks.1 = split.network.time.based(author.network, number.windows = 2) - split.author.networks.two[[i]] = author.net - split.bipartite.networks.two[[i]] = bipartite.net - } + network.builder$update.network.conf(updated.values = list(edge.attributes = c("author.email", "author.name"))) + author.network = network.builder$get.author.network() + networks.2 = split.network.time.based(author.network, number.windows = 2) - ## compare the edges and the vertices of all the author 
and bipartite networks that were previously - ## created with different approaches - compare.edge.and.vertex.lists(split.author.networks.one, split.author.networks.two, - split.bipartite.networks.one, split.bipartite.networks.two) -}, patrick::cases( - "sliding window: FALSE" = list(test.sliding.window = FALSE), - "sliding window: TRUE" = list(test.sliding.window = TRUE) -)) + ## edge attributes should be sorted, so the resulting networks should be the same + compare.edge.and.vertex.lists(networks.1, networks.2) +}) diff --git a/tests/test-networks-multi-relation.R b/tests/test-networks-multi-relation.R index 3d1aeb72..b264ec70 100644 --- a/tests/test-networks-multi-relation.R +++ b/tests/test-networks-multi-relation.R @@ -18,6 +18,7 @@ ## Copyright 2018-2019 by Claus Hunsen ## Copyright 2019 by Anselm Fehnker ## Copyright 2021 by Johannes Hostert +## Copyright 2022 by Jonathan Baumann ## All Rights Reserved. @@ -85,7 +86,7 @@ test_that("Network construction of the undirected author network with relation = "<6784529b0802032245r5164f984l342f0f0dc94aa420@mail.gmail.com>", "<65a1sf31sagd684dfv31@mail.gmail.com>", "<9b06e8d20801220234h659c18a3g95c12ac38248c7e0@mail.gmail.com>"), thread = c(NA, NA, NA, NA, NA, NA, NA, NA, - "", "", "", "") + "", "", "", "") ) ## build expected network @@ -131,8 +132,8 @@ test_that("Construction of the bipartite network for the feature artifact with a type = TYPE.ARTIFACT ) threads = data.frame( - name = c("", "", "", - "", "", "", "", "", ""), + name = c("", "", "", + "", "", "", "", "", ""), kind = "MailThread", type = TYPE.ARTIFACT ) @@ -149,9 +150,9 @@ test_that("Construction of the bipartite network for the feature artifact with a "", "", "", "", "", "", "", "", "", "", "", "", "", - "", "", "", "", "", "", "", # mail - "", "", "", "", "", "", "", - "", ""), + "", "", "", "", "", "", "", # mail + "", "", "", "", "", "", "", + "", ""), date = get.date.from.string(c("2013-05-05 21:46:30", "2013-05-05 21:49:21", "2013-05-05 21:49:34", # issue "2013-05-06 01:04:34", "2013-05-25 03:48:41", "2013-05-25 04:08:07", "2016-07-12 14:59:25", "2016-07-12 16:02:30", "2016-07-12 16:06:01", @@ -177,9 +178,9 @@ test_that("Construction of the bipartite network for the feature artifact with a "<9b06e8d20801220234h659c18a3g95c12ac38248c7e0@mail.gmail.com>", "<65a1sf31sagd684dfv31@mail.gmail.com>", ""), thread = c(rep(NA, 24), - "", "", "", "", "", "", - "", "", "", "", "", "", - "", "", "", ""), + "", "", "", "", "", "", + "", "", "", "", "", "", + "", "", "", ""), issue.id = c("", "", "", "", # issue "", "", "", "", "", "", "", @@ -295,7 +296,7 @@ test_that("Construction of the multi network for the feature artifact with autho "<9b06e8d20801220234h659c18a3g95c12ac38248c7e0@mail.gmail.com>", rep(NA, 29)), thread = c(rep(NA, 8), - "", "", "", "", + "", "", "", "", rep(NA, 29)), issue.id = c(rep(NA, 20), "", "", "", "", # bipartite issue @@ -309,14 +310,410 @@ test_that("Construction of the multi network for the feature artifact with autho network.expected = igraph::graph.data.frame(edges, vertices = vertices, directed = net.conf$get.value("author.directed")) - expected.edges = igraph::as_data_frame(network.expected, what = "edges") - expected.vertices = igraph::as_data_frame(network.expected, what = "vertices") + compare.networks(network.expected, network.built) +}) + +test_that("Construction of the multi-artifact bipartite network with artifact relations 'cochange' and 'issue'", { + + ## configurations + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, 
ARTIFACT) + proj.conf$update.value("commits.filter.base.artifact", FALSE) + net.conf.cochange = NetworkConf$new() + net.conf.cochange$update.values(updated.values = list(author.relation = "cochange", artifact.relation = "cochange")) + net.conf.issue = NetworkConf$new() + net.conf.issue$update.values(updated.values = list(author.relation = "issue", artifact.relation = "issue")) + + ## construct objects + proj.data = ProjectData$new(project.conf = proj.conf) + network.builder.cochange = NetworkBuilder$new(project.data = proj.data, network.conf = net.conf.cochange) + network.builder.issue = NetworkBuilder$new(project.data = proj.data, network.conf = net.conf.issue) + + ## build a multi-artifact network by merging two different artifact networks + net.cochange = network.builder.cochange$get.bipartite.network() + net.issue = network.builder.issue$get.bipartite.network() + + net.combined = merge.networks(list(net.cochange, net.issue)) + + ## build expected network + vertices = data.frame(name = c("Björn", "Karl", "Olaf", "Thomas", "A", "Base_Feature", + "foo", "Max", "", + "", "", + "", "", "", + ""), + kind = c(rep(TYPE.AUTHOR, 4), rep("Feature", 3), TYPE.AUTHOR, rep("Issue", 7)), + type = c(rep(TYPE.AUTHOR, 4), rep(TYPE.ARTIFACT, 3), TYPE.AUTHOR, rep(TYPE.ARTIFACT, 7)) + ) + row.names(vertices) = c("Björn", "Karl", "Olaf", "Thomas", "A", "Base_Feature", + "foo", "Max", "", + "", "", + "", "", "", + "") + + edges = data.frame( + from = c("Björn", "Karl", "Olaf", "Olaf", "Thomas", "Thomas", "Björn", + "Björn", "Björn", "Björn", "Björn", "Björn", "Björn", "Björn", + "Björn", "Björn", "Björn", "Karl", "Max", "Max", "Max", + "Olaf", "Olaf", "Olaf", "Olaf", "Olaf", "Olaf", "Thomas", + "Thomas", "Thomas"), + to = c("A", "Base_Feature", "A", + "Base_Feature", "Base_Feature", "foo", + "","","", + "","","", + "", "","", + "","", "", + "","","", + "","","", + "","", "", + "","", ""), + date = get.date.from.string(c("2016-07-12 15:58:59 UTC", "2016-07-12 16:06:10 UTC", + "2016-07-12 16:00:45 UTC", "2016-07-12 16:05:41 UTC", + "2016-07-12 16:06:32 UTC", "2016-07-12 16:06:32 UTC", + "2013-05-05 21:46:30 UTC", "2013-05-05 21:49:21 UTC", + "2013-05-05 21:49:34 UTC", "2013-05-06 01:04:34 UTC", + "2013-05-25 03:48:41 UTC", "2013-05-25 04:08:07 UTC", + "2016-07-12 14:59:25 UTC", "2016-07-12 16:02:30 UTC", + "2016-07-12 16:06:01 UTC", "2016-07-15 19:55:39 UTC", + "2017-05-23 12:32:39 UTC", "2016-07-12 15:59:59 UTC", + "2016-07-15 20:07:47 UTC", "2016-07-27 20:12:08 UTC", + "2016-07-28 06:27:52 UTC", "2013-05-25 03:25:06 UTC", + "2013-05-25 06:06:53 UTC", "2013-05-25 06:22:23 UTC", + "2013-06-01 06:50:26 UTC", "2016-07-12 16:01:01 UTC", + "2016-07-12 16:02:02 UTC", "2013-04-21 23:52:09 UTC", + "2016-07-12 15:59:25 UTC", "2016-07-12 16:03:59 UTC")), + artifact.type = c(rep("Feature", 6), rep("IssueEvent", 24)), + hash = c("72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "1143db502761379c2bfcecc2007fc34282e7ee61", + "5a5ec9675e98187e1e92561e1888aa6f04faa338", "3a0ed78458b3976243db6829f63eba3eead26774", + "0a1a5c523d835459c42f33e863623138555e2526", "0a1a5c523d835459c42f33e863623138555e2526", + rep(NA, 24)), + file = c("test.c", "test3.c", "test.c", "test2.c", "test2.c", "test2.c", rep(NA, 24)), + artifact = c("A", "Base_Feature", "A", "Base_Feature", "Base_Feature", "foo", rep(NA, 24)), + weight = c(rep(1, 30)), + type = c(rep("Bipartite", 30)), + relation = c(rep("cochange", 6), rep("issue", 24)), + issue.id = c(NA, NA, NA, + NA, NA, NA, + "", "", "", + "", "", "", + "", "", "", + "", "", "", + "", "", "", + "", "", "", 
+ "", "", "", + "", "", ""), + event.name = c(rep(NA, 6), rep("commented", 24)) + ) + + net.expected = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + + compare.networks(net.expected, net.combined) + +}) + +test_that("Construction of the multi-artifact bipartite network with artifact relations 'cochange' and 'mail'", { + + ## configurations + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("commits.filter.base.artifact", FALSE) + net.conf.cochange = NetworkConf$new() + net.conf.cochange$update.values(updated.values = list(author.relation = "cochange", artifact.relation = "cochange")) + net.conf.mail = NetworkConf$new() + net.conf.mail$update.values(updated.values = list(author.relation = "mail", artifact.relation = "mail")) + + ## construct objects + proj.data = ProjectData$new(project.conf = proj.conf) + network.builder.cochange = NetworkBuilder$new(project.data = proj.data, network.conf = net.conf.cochange) + network.builder.mail = NetworkBuilder$new(project.data = proj.data, network.conf = net.conf.mail) + + ## build a multi-artifact network by merging two different artifact networks + net.cochange = network.builder.cochange$get.bipartite.network() + net.mail = network.builder.mail$get.bipartite.network() + + net.combined = merge.networks(list(net.cochange, net.mail)) + + ## build expected network + vertices = data.frame(name = c("Björn", "Karl", "Olaf", + "Thomas", "A", "Base_Feature", + "foo", "Fritz fritz@example.org","georg", + "Hans", "udo", "", + "", "", "", + "", "", "", + "", ""), + kind = c(rep(TYPE.AUTHOR, 4), rep("Feature", 3), rep(TYPE.AUTHOR, 4), rep("MailThread", 9)), + type = c(rep(TYPE.AUTHOR, 4), rep(TYPE.ARTIFACT, 3), rep(TYPE.AUTHOR, 4), rep(TYPE.ARTIFACT, 9)) + ) + row.names(vertices) = c("Björn", "Karl", "Olaf", + "Thomas", "A", "Base_Feature", + "foo", "Fritz fritz@example.org","georg", + "Hans", "udo", "", + "", "", "", + "", "", "", + "", "") + + edges = data.frame( + from = c("Björn", "Karl", "Olaf", "Olaf", "Thomas", "Thomas", "Björn", "Björn", + "Björn", "Fritz fritz@example.org", "georg", "Hans", "Hans", "Hans", + "Hans", "Hans", "Hans", "Hans", "Olaf", "Olaf", "Thomas", "udo"), + to = c("A", "Base_Feature", "A", "Base_Feature", "Base_Feature", "foo", "", + "", "", "", "", "", + "", "", "", "", "", + "", "", "", "", ""), + date = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:06:10", "2016-07-12 16:00:45", + "2016-07-12 16:05:41", "2016-07-12 16:06:32", "2016-07-12 16:06:32", + "2004-10-09 18:38:13", "2005-02-09 18:49:49", "2016-07-12 15:58:40", + "2010-07-12 11:05:35", "2010-07-12 12:05:34", "2010-07-12 12:05:40", + "2010-07-12 12:05:41", "2010-07-12 12:05:42", "2010-07-12 12:05:43", + "2010-07-12 12:05:44", "2010-07-12 12:05:45", "2010-07-12 12:05:46", + "2016-07-12 15:58:50", "2016-07-12 16:05:37", "2016-07-12 16:04:40", + "2010-07-12 10:05:36")), + artifact.type = c(rep("Feature", 6), rep("Mail", 16)), + hash = c("72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "1143db502761379c2bfcecc2007fc34282e7ee61", + "5a5ec9675e98187e1e92561e1888aa6f04faa338", "3a0ed78458b3976243db6829f63eba3eead26774", + "0a1a5c523d835459c42f33e863623138555e2526", "0a1a5c523d835459c42f33e863623138555e2526", + rep(NA, 16)), + file = c("test.c", "test3.c", "test.c", "test2.c", "test2.c", "test2.c", rep(NA, 16)), + artifact = c("A", "Base_Feature", "A", "Base_Feature", "Base_Feature", "foo", rep(NA, 16)), + weight = rep(1,22), + type = rep("Bipartite", 22), + relation = c(rep("cochange", 6), 
rep("mail", 16)), + message.id = c(rep(NA, 6), "", + "<1107974989.17910.6.camel@jmcmullan>", "<4cbaa9ef0802201124v37f1eec8g89a412dfbfc8383a@mail.gmail.com>", + "", "", + "", "", "", + "", "", "", + "", "<6784529b0802032245r5164f984l342f0f0dc94aa420@mail.gmail.com>", + "<9b06e8d20801220234h659c18a3g95c12ac38248c7e0@mail.gmail.com>", + "<65a1sf31sagd684dfv31@mail.gmail.com>", "" + ), + thread = c(rep(NA, 6), "", "", "", "", + "", "", "", "", "", + "", "", "", "", "", + "", "") + ) + + net.expected = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + + compare.networks(net.expected, net.combined) + +}) + +test_that("Construction of the multi-artifact bipartite network with artifact relations 'issue' and 'mail'", { + + ## configurations + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("commits.filter.base.artifact", FALSE) + net.conf.issue = NetworkConf$new() + net.conf.issue$update.values(updated.values = list(author.relation = "issue", artifact.relation = "issue")) + net.conf.mail = NetworkConf$new() + net.conf.mail$update.values(updated.values = list(author.relation = "mail", artifact.relation = "mail")) + + ## construct objects + proj.data = ProjectData$new(project.conf = proj.conf) + network.builder.issue = NetworkBuilder$new(project.data = proj.data, network.conf = net.conf.issue) + network.builder.mail = NetworkBuilder$new(project.data = proj.data, network.conf = net.conf.mail) + + ## build a multi-artifact network by merging two different artifact networks + net.issue = network.builder.issue$get.bipartite.network() + net.mail = network.builder.mail$get.bipartite.network() + + net.combined = merge.networks(list(net.issue, net.mail)) + + ## build expected network + vertices = data.frame(name = c("Björn", "Karl", "Max", "Olaf", "Thomas", + "", "", "", + "", "", "", "", + "Fritz fritz@example.org", "georg", "Hans", "udo", "", + "", "", "", "", "", + "", "", ""), + kind = c(rep("Author", 5), rep("Issue", 7), rep("Author", 4), rep("MailThread", 9)), + type = c(rep("Author", 5), rep("Artifact", 7), rep("Author", 4), rep("Artifact", 9)) + ) + row.names(vertices) = c("Björn", "Karl", "Max", "Olaf", "Thomas", + "", "", "", + "", "", "", "", + "Fritz fritz@example.org", "georg", "Hans", "udo", "", + "", "", "", "", "", + "", "", "") + + edges = data.frame( + from = c("Björn", "Björn", "Björn", "Björn", "Björn", "Björn", "Björn", "Björn", "Björn", + "Björn", "Björn", "Karl", "Max", "Max", "Max", "Olaf", "Olaf", "Olaf", "Olaf", + "Olaf", "Olaf", "Thomas", "Thomas", "Thomas", "Björn", "Björn", "Björn", + "Fritz fritz@example.org", "georg", "Hans", "Hans", "Hans", "Hans", "Hans", + "Hans", "Hans", "Olaf", "Olaf", "Thomas", "udo"), + to = c("", "", "", + "", "", "", + "", "", "", "", + "", "", "", "", + "", "", "", + "", "", "", + "", "", "", + "", "", "", "", "", + "", "", "", "", "", "", + "", "", "", "", "", ""), + date = get.date.from.string(c("2013-05-05 21:46:30", "2013-05-05 21:49:21", "2013-05-05 21:49:34", + "2013-05-06 01:04:34", "2013-05-25 03:48:41", "2013-05-25 04:08:07", + "2016-07-12 14:59:25", "2016-07-12 16:02:30", "2016-07-12 16:06:01", + "2016-07-15 19:55:39", "2017-05-23 12:32:39", "2016-07-12 15:59:59", + "2016-07-15 20:07:47", "2016-07-27 20:12:08", "2016-07-28 06:27:52", + "2013-05-25 03:25:06", "2013-05-25 06:06:53", "2013-05-25 06:22:23", + "2013-06-01 06:50:26", "2016-07-12 16:01:01", "2016-07-12 16:02:02", + "2013-04-21 23:52:09", "2016-07-12 15:59:25", "2016-07-12 16:03:59", + "2004-10-09 
18:38:13", "2005-02-09 18:49:49", "2016-07-12 15:58:40", + "2010-07-12 11:05:35", "2010-07-12 12:05:34", "2010-07-12 12:05:40", + "2010-07-12 12:05:41", "2010-07-12 12:05:42", "2010-07-12 12:05:43", + "2010-07-12 12:05:44", "2010-07-12 12:05:45", "2010-07-12 12:05:46", + "2016-07-12 15:58:50", "2016-07-12 16:05:37", "2016-07-12 16:04:40", + "2010-07-12 10:05:36")), + artifact.type = c(rep("IssueEvent", 24), rep("Mail", 16)), + issue.id = c("", "", "", + "", "", "", + "", "", "", + "", "", "", + "", "", "", + "", "", "", + "", "", "", + "", "", "", rep(NA, 16)), + event.name = c(rep("commented", 24), rep(NA, 16)), + weight = rep(1, 40), + type = rep("Bipartite", 40), + relation = c(rep("issue", 24), rep("mail", 16)), + message.id = c(rep(NA, 24), + "", "<1107974989.17910.6.camel@jmcmullan>", + "<4cbaa9ef0802201124v37f1eec8g89a412dfbfc8383a@mail.gmail.com>", + "", "", + "", "", "", + "", "", "", + "", "<6784529b0802032245r5164f984l342f0f0dc94aa420@mail.gmail.com>", + "<9b06e8d20801220234h659c18a3g95c12ac38248c7e0@mail.gmail.com>", + "<65a1sf31sagd684dfv31@mail.gmail.com>", "" + ), + thread = c(rep(NA, 24), "", "", "", "", "", + "", "", "", "", "", "", + "", "", "", "", "") + ) + + net.expected = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + + compare.networks(net.expected, net.combined) + +}) + +test_that("Construction of the multi-artifact bipartite network with artifact relations 'cochange', 'issue', and 'mail'", { + + ## configurations + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("commits.filter.base.artifact", FALSE) + net.conf.cochange = NetworkConf$new() + net.conf.cochange$update.values(updated.values = list(author.relation = "cochange", artifact.relation = "cochange")) + net.conf.issue = NetworkConf$new() + net.conf.issue$update.values(updated.values = list(author.relation = "issue", artifact.relation = "issue")) + net.conf.mail = NetworkConf$new() + net.conf.mail$update.values(updated.values = list(author.relation = "mail", artifact.relation = "mail")) + + ## construct objects + proj.data = ProjectData$new(project.conf = proj.conf) + network.builder.cochange = NetworkBuilder$new(project.data = proj.data, network.conf = net.conf.cochange) + network.builder.issue = NetworkBuilder$new(project.data = proj.data, network.conf = net.conf.issue) + network.builder.mail = NetworkBuilder$new(project.data = proj.data, network.conf = net.conf.mail) + + ## build a multi-artifact network by merging two different artifact networks + net.cochange = network.builder.cochange$get.bipartite.network() + net.issue = network.builder.issue$get.bipartite.network() + net.mail = network.builder.mail$get.bipartite.network() + + net.combined = merge.networks(list(net.cochange, net.issue, net.mail)) + + ## build expected network + vertices = data.frame(name = c("Björn", "Karl", "Olaf", "Thomas", "A", "Base_Feature", "foo", + "Max", "", "", + "", "", "", + "", "", "Fritz fritz@example.org", + "georg", "Hans", "udo", "", "", "", + "", "", "", "", "", + ""), + kind = c(rep("Author", 4), rep("Feature", 3), "Author", + rep("Issue", 7), rep("Author", 4), rep("MailThread", 9)), + type = c(rep("Author", 4), rep("Artifact", 3), "Author", + rep("Artifact", 7), rep("Author", 4), rep("Artifact", 9))) + row.names(vertices) = c("Björn", "Karl", "Olaf", "Thomas", "A", "Base_Feature", "foo", + "Max", "", "", + "", "", "", + "", "", "Fritz fritz@example.org", + "georg", "Hans", "udo", "", "", "", + "", "", "", "", "", + "") + + edges = 
data.frame( + from = c("Björn", "Karl", "Olaf", "Olaf", "Thomas", "Thomas", "Björn", "Björn", + "Björn", "Björn", "Björn", "Björn", "Björn", "Björn", "Björn", "Björn", + "Björn", "Karl", "Max", "Max", "Max", "Olaf", "Olaf", "Olaf", "Olaf", + "Olaf", "Olaf", "Thomas", "Thomas", "Thomas", "Björn", "Björn", "Björn", + "Fritz fritz@example.org", "georg", "Hans", "Hans", "Hans", "Hans", "Hans", + "Hans", "Hans", "Olaf", "Olaf", "Thomas", "udo"), + to = c("A", "Base_Feature", "A", "Base_Feature", "Base_Feature", "foo", "", + "", "", "", + "", "", "", + "", "", "", + "", "", "", + "", "", "", + "", "", "", + "", "", "", + "", "", "", "", + "", "", "", "", "", + "", "", "", "", "", + "", "", "", ""), + date = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:06:10", "2016-07-12 16:00:45", + "2016-07-12 16:05:41", "2016-07-12 16:06:32", "2016-07-12 16:06:32", + "2013-05-05 21:46:30", "2013-05-05 21:49:21", "2013-05-05 21:49:34", + "2013-05-06 01:04:34", "2013-05-25 03:48:41", "2013-05-25 04:08:07", + "2016-07-12 14:59:25", "2016-07-12 16:02:30", "2016-07-12 16:06:01", + "2016-07-15 19:55:39", "2017-05-23 12:32:39", "2016-07-12 15:59:59", + "2016-07-15 20:07:47", "2016-07-27 20:12:08", "2016-07-28 06:27:52", + "2013-05-25 03:25:06", "2013-05-25 06:06:53", "2013-05-25 06:22:23", + "2013-06-01 06:50:26", "2016-07-12 16:01:01", "2016-07-12 16:02:02", + "2013-04-21 23:52:09", "2016-07-12 15:59:25", "2016-07-12 16:03:59", + "2004-10-09 18:38:13", "2005-02-09 18:49:49", "2016-07-12 15:58:40", + "2010-07-12 11:05:35", "2010-07-12 12:05:34", "2010-07-12 12:05:40", + "2010-07-12 12:05:41", "2010-07-12 12:05:42", "2010-07-12 12:05:43", + "2010-07-12 12:05:44", "2010-07-12 12:05:45", "2010-07-12 12:05:46", + "2016-07-12 15:58:50", "2016-07-12 16:05:37", "2016-07-12 16:04:40", + "2010-07-12 10:05:36")), + artifact.type = c(rep("Feature", 6), rep("IssueEvent", 24), rep("Mail", 16)), + hash = c("72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "1143db502761379c2bfcecc2007fc34282e7ee61", + "5a5ec9675e98187e1e92561e1888aa6f04faa338", "3a0ed78458b3976243db6829f63eba3eead26774", + "0a1a5c523d835459c42f33e863623138555e2526", "0a1a5c523d835459c42f33e863623138555e2526", + rep(NA, 40)), + file = c("test.c", "test3.c", "test.c", "test2.c", "test2.c", "test2.c", rep(NA, 40)), + artifact = c("A", "Base_Feature", "A", "Base_Feature", "Base_Feature", "foo", rep(NA, 40)), + weight = rep(1, 46), + type = rep("Bipartite", 46), + relation = c(rep("cochange", 6), rep("issue", 24), rep("mail", 16)), + issue.id = c(rep(NA, 6), "", "", + "", "", "", + "", "", "", + "", "", "", + "", "", "", + "", "", "", + "", "", "", + "", "", "", + "", rep(NA, 16)), + event.name = c(rep(NA, 6), rep("commented", 24), rep(NA, 16)), + message.id = c(rep(NA, 30), "", + "<1107974989.17910.6.camel@jmcmullan>", "<4cbaa9ef0802201124v37f1eec8g89a412dfbfc8383a@mail.gmail.com>", + "", "", + "", "", "", + "", "", "", + "", "<6784529b0802032245r5164f984l342f0f0dc94aa420@mail.gmail.com>", + "<9b06e8d20801220234h659c18a3g95c12ac38248c7e0@mail.gmail.com>", + "<65a1sf31sagd684dfv31@mail.gmail.com>", ""), + thread = c(rep(NA, 30), "", "", "", "", + "", "", "", "", "", + "", "", "", "", "", + "", "") + ) + + net.expected = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) - built.edges = igraph::as_data_frame(network.built, what = "edges") - built.vertices = igraph::as_data_frame(network.built, what = "vertices") + compare.networks(net.expected, net.combined) - expect_identical(expected.edges, built.edges, info = "Multi network edges") - 
expect_identical(expected.vertices, built.vertices, info = "Multi network vertices") - ## TODO as soon as the bug in igraph is fixed switch to the expect_true function below - # expect_true(igraph::identical_graphs(network.expected, network.built)) }) diff --git a/tests/test-networks-multi.R b/tests/test-networks-multi.R index e6bf3925..9ce03817 100644 --- a/tests/test-networks-multi.R +++ b/tests/test-networks-multi.R @@ -14,6 +14,7 @@ ## Copyright 2018 by Christian Hechtl ## Copyright 2018 by Claus Hunsen ## Copyright 2018 by Barbara Eckl +## Copyright 2022 by Jonathan Baumann ## All Rights Reserved. @@ -91,14 +92,6 @@ test_that("Construction of the multi network for the feature artifact with autho network.expected = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) - expected.edges = igraph::as_data_frame(network.expected, what = "edges") - expected.vertices = igraph::as_data_frame(network.expected, what = "vertices") - - built.edges = igraph::as_data_frame(network.built, what = "edges") - built.vertices = igraph::as_data_frame(network.built, what = "vertices") - - expect_identical(expected.edges, built.edges, info = "Multi network edges") - expect_identical(expected.vertices, built.vertices, info = "Multi network vertices") - ## TODO as soon as the bug in igraph is fixed switch to the expect_true function below - # expect_true(igraph::identical_graphs(network.expected, network.built)) + compare.networks(network.expected, network.built) }) + diff --git a/tests/test-networks.R b/tests/test-networks.R index 7722dec3..24a1a098 100644 --- a/tests/test-networks.R +++ b/tests/test-networks.R @@ -130,7 +130,7 @@ test_that("Extraction of sub-networks", { ## ## construct original bipartite network - bip.net.built = extract.bipartite.network.from.network(base.net) + bip.net.built = extract.bipartite.network.from.network(base.net, remove.isolates = TRUE) ## construct expected bipartite network (by removing unipartite edges and isolate vertices) bip.net.expected = igraph::delete.edges(base.net, igraph::E(base.net)[1:9]) @@ -169,7 +169,7 @@ test_that("Extraction of sub-networks", { info = "extract bipartite network from edgeless network") ## the extracted network should be empty then (but with all attributes!) expect_true(igraph::identical_graphs( - extract.bipartite.network.from.network(edgeless.net), + extract.bipartite.network.from.network(edgeless.net, remove.isolates = TRUE), base.net - igraph::vertices(seq_len(igraph::vcount(base.net))) ), info = "extracted bipartite network is empty for edgeless base network") }) diff --git a/tests/test-read.R b/tests/test-read.R index cf627a56..48cae572 100644 --- a/tests/test-read.R +++ b/tests/test-read.R @@ -20,6 +20,7 @@ ## Copyright 2020-2021 by Niklas Schneider ## Copyright 2021 by Johannes Hostert ## Copyright 2021 by Mirabdulla Yusifli +## Copyright 2022 by Jonathan Baumann ## All Rights Reserved. 
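## The new multi-artifact tests above build a combined network via
## merge.networks(list(...)) and validate it with the compare.networks helper,
## which also replaces the expect_identical boilerplate removed in the hunks
## above. A minimal sketch of the idea behind such a helper, assuming only
## igraph and testthat (the actual helper lives in coronet's test utilities
## and may differ in detail):
compare.networks.sketch = function(network.expected, network.built) {
    ## compare the graphs via their edge and vertex data frames, which
    ## side-steps the igraph bug mentioned in the removed TODO comments
    expected.edges = igraph::as_data_frame(network.expected, what = "edges")
    built.edges = igraph::as_data_frame(network.built, what = "edges")
    testthat::expect_identical(expected.edges, built.edges, info = "network edges")

    expected.vertices = igraph::as_data_frame(network.expected, what = "vertices")
    built.vertices = igraph::as_data_frame(network.built, what = "vertices")
    testthat::expect_identical(expected.vertices, built.vertices, info = "network vertices")
}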
@@ -216,7 +217,8 @@ test_that("Read the mail data.", { "=?KOI8-R?Q?=EF=D4=D7=C5=D4:_Some_patches?= 2", "Re: busybox 1", "=?KOI8-R?Q?=EF=D4=D7=C5=D4:_Some_patches?= tab", "Re: Fw: busybox 2 tab", "Re: Fw: busybox 10"), - thread = sprintf("", c(1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 7, 8, 8, 8, 9, 9)), + thread = sprintf("", c("13#1", "42#2", "13#3", "42#4", "42#5", "42#6", "42#6", "42#6", + "42#6", "42#6", "42#6", "42#7", "13#8", "13#8", "13#8", "13#9", "13#9")), artifact.type = "Mail" ) ## delete the line with the empty date @@ -259,7 +261,7 @@ test_that("Read and parse the gender data.", { ## read the actual data gender.data.read = read.gender(proj.conf$get.value("datapath.gender")) - + ## build the expected data.frame gender.data.expected = data.frame(author.name = c("Björn", "Fritz fritz@example.org", "georg", "Hans", "Karl", "Max", "Olaf", "Thomas", "udo"), gender = c("male", NA, "male", "male", "male", "male", "female", "male", "female")) @@ -287,6 +289,26 @@ test_that("Read the raw bot data.", { expect_identical(bot.data.read, bot.data.expected, info = "Bot data.") }) +test_that("Read custom event timestamps.", { + + ## configuration object for the datapath + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("custom.event.timestamps.file", "custom-events.list") + + ## read the actual data + timestamps = read.custom.event.timestamps(proj.conf$get.value("datapath"), proj.conf$get.value("custom.event.timestamps.file")) + + timestamps.expected = list( + "Test event 1" = "2016-07-12 15:00:00", + "Test event 2" = "2016-07-12 16:00:00", + "Test event 3" = "2016-07-12 16:05:00", + "Test event 4" = "2016-08-08", + "Test event 5" = "2016-10-05 09:00:00" + ) + + expect_identical(timestamps, timestamps.expected, "Custom timestamps.") +}) + test_that("Read and parse the pasta data.", { ## configuration object for the datapath proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) diff --git a/tests/test-split-sliding-window.R b/tests/test-split-data-activity-based.R similarity index 68% rename from tests/test-split-sliding-window.R rename to tests/test-split-data-activity-based.R index 40813b64..c654446f 100644 --- a/tests/test-split-sliding-window.R +++ b/tests/test-split-data-activity-based.R @@ -12,18 +12,16 @@ ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ## ## Copyright 2017-2019 by Claus Hunsen -## Copyright 2017 by Felix Prasse -## Copyright 2018 by Thomas Bock ## Copyright 2020 by Thomas Bock ## Copyright 2018 by Christian Hechtl ## Copyright 2018 by Jakob Kronawitter ## Copyright 2019 by Anselm Fehnker ## Copyright 2021 by Niklas Schneider ## Copyright 2021 by Johannes Hostert +## Copyright 2022 by Jonathan Baumann ## All Rights Reserved. - -context("Splitting functionality, using sliding windows.") +context("Splitting functionality, activity-based splitting of data.") ## ## Context @@ -38,30 +36,23 @@ ARTIFACT = "feature" if (!dir.exists(CF.DATA)) CF.DATA = file.path(".", "tests", "codeface-data") -## -## NOTE -## - -## In this test file, we rather test the raw data contents of the data objects -## instead of the networks that can be constructed from these data items! 
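## The new test above expects read.custom.event.timestamps to return a named
## list mapping event names to timestamp strings, read from a file inside the
## project's data path. A minimal sketch of such a reader; the two-column
## 'name;timestamp' layout is a hypothetical format chosen for illustration,
## not necessarily coronet's actual file format:
read.custom.event.timestamps.sketch = function(data.path, file.name) {
    lines = readLines(file.path(data.path, file.name))
    fields = strsplit(lines, ";", fixed = TRUE)
    ## one list entry per line: event name -> timestamp string
    timestamps = lapply(fields, function(field) field[[2]])
    names(timestamps) = vapply(fields, function(field) field[[1]], character(1))
    return(timestamps)
}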
- - ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Split data -------------------------------------------------------------- -## * time-based ------------------------------------------------------------ +## * activity-based -------------------------------------------------------- -## * * time period --------------------------------------------------------- +## * * activity amount ----------------------------------------------------- ## -## Tests for split.data.time.based(..., split.basis = 'commits'), using sliding windows +## Tests for split.data.activity.based(..., activity.amount = ..., activity.type = 'commits') ## -test_that("Split a data object time-based (split.basis = 'commits', sliding.window = TRUE).", { +patrick::with_parameters_test_that("Split a data object activity-based (activity.amount = ..., activity.type = 'commits').", { ## configuration objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) net.conf = NetworkConf$new() ## data object @@ -76,19 +67,18 @@ test_that("Split a data object time-based (split.basis = 'commits', sliding.wind ) ## split data - results = split.data.time.based(project.data, time.period = "3 min", - split.basis = "commits", sliding.window = TRUE) + results = split.data.activity.based(project.data, activity.amount = 3, + activity.type = "commits", sliding.window = FALSE) ## check time ranges expected = c( - "2016-07-12 15:58:59-2016-07-12 16:01:59", - "2016-07-12 16:00:29-2016-07-12 16:03:29", - "2016-07-12 16:01:59-2016-07-12 16:04:59", - "2016-07-12 16:03:29-2016-07-12 16:06:29", - "2016-07-12 16:04:59-2016-07-12 16:06:33" + "2016-07-12 15:58:59-2016-07-12 16:06:10", + "2016-07-12 16:06:10-2016-07-12 16:06:32", + "2016-07-12 16:06:32-2016-07-12 16:06:33" ) lapply(results, function(res) { - expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") + expect_equal(res$get.project.conf()$get.value("ranges"), expected, + info = "Time ranges (activity.amount).") }) ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. 
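## In activity-based splitting, the window borders come from the event
## timestamps themselves: with eight commits and activity.amount = 3, the
## windows cover commits 1:3, 4:6, and 7:8, matching the three expected
## ranges above. A rough sketch of the boundary computation, assuming a
## sorted vector of commit dates (the actual implementation additionally
## handles ties and the inclusiveness of the final range):
compute.activity.boundaries = function(dates, activity.amount) {
    dates = sort(dates)
    ## a new window starts at every activity.amount-th event ...
    start.indices = seq(1, length(dates), by = activity.amount)
    ## ... and the last event closes the final window
    c(dates[start.indices], dates[length(dates)])
}
## for the commit data above, this yields the four split.revisions checked in
## the next hunk: 15:58:59, 16:06:10, 16:06:32, and 16:06:33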
@@ -97,13 +87,11 @@ test_that("Split a data object time-based (split.basis = 'commits', sliding.wind ## test that the config contains the correct splitting information expected.config = list( - split.type = "time-based", - split.length = "3 min", + split.type = "activity-based", + split.length = 3, split.basis = "commits", - split.sliding.window = TRUE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:00:29", "2016-07-12 16:01:59", - "2016-07-12 16:03:29", "2016-07-12 16:04:59", "2016-07-12 16:06:29", - "2016-07-12 16:06:33"), + split.sliding.window = FALSE, + split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:10", "2016-07-12 16:06:32", "2016-07-12 16:06:33"), split.revision.dates = NULL ) lapply(results, function(res) { @@ -115,46 +103,34 @@ test_that("Split a data object time-based (split.basis = 'commits', sliding.wind ## check data for all ranges expected.data = list( commits = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$commits[1:2, ], - "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$commits[2, ], - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$commits[0, ], - "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$commits[3:5, ], - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commits[3:8, ] + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commits[1:3, ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commits[4:6, ], + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commits[7:8, ] ), commit.messages = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$commit.messages, - "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$commit.messages, - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$commit.messages, - "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$commit.messages, - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commit.messages + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commit.messages, + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commit.messages, + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$issues[rownames(data$issues) %in% c(14, 20:22, 37:40), ], - "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$issues[rownames(data$issues) %in% c(14:15, 40, 47:49), ], - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$issues[rownames(data$issues) %in% c(15, 29, 47:49), ], - "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$issues[rownames(data$issues) %in% c(29,41,45,46), ], - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(23,41,45,46), ] + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:41, 45:49), ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 23, ], + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$issues[0, ] ), mails = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$mails[0, ], - "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$mails[0, ], - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$mails[rownames(data$mails) == 16, ], - "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$mails[rownames(data$mails) %in% c(16, 17), ], - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$mails[rownames(data$mails) == 17, ] + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[15:16, ], # when pasta is not configured: rownames(data$mails) %in% 16:17 + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$mails[0, ], + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$mails[0, ] ), pasta = list( - "2016-07-12 
15:58:59-2016-07-12 16:01:59" = data$pasta, - "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$pasta, - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$pasta, - "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$pasta, - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$pasta + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$pasta, + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$pasta, + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$pasta ), synchronicity = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$synchronicity, - "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$synchronicity, - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$synchronicity, - "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$synchronicity, - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$synchronicity + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$synchronicity, + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$synchronicity, + "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$synchronicity ) ) results.data = list( @@ -165,50 +141,27 @@ test_that("Split a data object time-based (split.basis = 'commits', sliding.wind pasta = lapply(results, function(cf.data) cf.data$get.pasta()), synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) - expect_equal(results.data, expected.data, info = "Data for ranges.") - -}) - - -## -## Tests for split.data.time.based(..., split.basis = 'mails'), using sliding windows -## -test_that("Split a data object time-based (split.basis = 'mails', sliding.window = TRUE).", { + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) - ## configuration objects - proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("issues.only.comments", FALSE) - net.conf = NetworkConf$new() + expect_equal(results.data, expected.data, info = "Data for ranges (activity.amount).") - ## data object - project.data = ProjectData$new(proj.conf) - data = list( - commits = project.data$get.commits.unfiltered(), - commit.messages = project.data$get.commit.messages(), - issues = project.data$get.issues(), - mails = project.data$get.mails(), - pasta = project.data$get.pasta(), - synchronicity = project.data$get.synchronicity() - ) + ## + ## split by too-large activity amount + ## ## split data - results = split.data.time.based(project.data, time.period = "3 years", - split.basis = "mails", sliding.window = TRUE) + results = split.data.activity.based(project.data, activity.amount = nrow(data$commits) + 10, + activity.type = "commits", sliding.window = FALSE) ## check time ranges expected = c( - "2004-10-09 18:38:13-2007-10-10 12:38:13", - "2006-04-10 15:38:13-2009-04-10 09:38:13", - "2007-10-10 12:38:13-2010-10-10 06:38:13", - "2009-04-10 09:38:13-2012-04-10 03:38:13", - "2010-10-10 06:38:13-2013-10-10 00:38:13", - "2012-04-10 03:38:13-2015-04-10 21:38:13", - "2013-10-10 00:38:13-2016-07-12 16:05:38" + "2016-07-12 15:58:59-2016-07-12 16:06:33" ) - lapply(results, function(res) { - expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") + expect_equal(res$get.project.conf()$get.value("ranges"), expected, + info = "Time ranges (too-large activity amount).") }) ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. 
@@ -217,13 +170,11 @@ test_that("Split a data object time-based (split.basis = 'mails', sliding.window ## test that the config contains the correct splitting information expected.config = list( - split.type = "time-based", - split.length = "3 years", - split.basis = "mails", - split.sliding.window = TRUE, - split.revisions = c("2004-10-09 18:38:13", "2006-04-10 15:38:13", "2007-10-10 12:38:13", - "2009-04-10 09:38:13", "2010-10-10 06:38:13", "2012-04-10 03:38:13", - "2013-10-10 00:38:13", "2015-04-10 21:38:13", "2016-07-12 16:05:38"), + split.type = "activity-based", + split.length = 18, + split.basis = "commits", + split.sliding.window = FALSE, + split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:33"), split.revision.dates = NULL ) lapply(results, function(res) { @@ -232,62 +183,25 @@ test_that("Split a data object time-based (split.basis = 'mails', sliding.window expect_equal(expected.config, actual) }) - ## check data for all ranges expected.data = list( commits = list( - "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$commits[0, ], - "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$commits[0, ], - "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$commits[0, ], - "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$commits[0, ], - "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$commits[0, ], - "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$commits[0, ], - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$commits[1:2, ] + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$commits ), commit.messages = list( - "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$commit.messages, - "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$commit.messages, - "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$commit.messages, - "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$commit.messages, - "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$commit.messages, - "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$commit.messages, - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$commit.messages + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$issues[0, ], - "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$issues[0, ], - "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$issues[0, ], - "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$issues[0, ], - "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$issues[rownames(data$issues) %in% 1:13, ], - "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$issues[rownames(data$issues) %in% 1:13, ], - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 27:29, 37:40, 43:49), ] + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(14:15, 20:23, 29, 37:41, 45:49), ] ), mails = list( - "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$mails[rownames(data$mails) %in% 1:2, ], - "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$mails[0, ], - "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$mails[rownames(data$mails) %in% 3:12, ], - "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$mails[rownames(data$mails) %in% 3:12, ], - "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$mails[0, ], - "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$mails[0, ], - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 13:17, ] + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$mails[15:16, ] # when pasta is not configured: rownames(data$mails) %in% 16:17 ), pasta = list( - "2004-10-09 18:38:13-2007-10-10 12:38:13" 
= data$pasta, - "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$pasta, - "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$pasta, - "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$pasta, - "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$pasta, - "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$pasta, - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$pasta + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$pasta ), synchronicity = list( - "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$synchronicity, - "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$synchronicity, - "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$synchronicity, - "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$synchronicity, - "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$synchronicity, - "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$synchronicity, - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$synchronicity + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$synchronicity ) ) results.data = list( @@ -298,20 +212,27 @@ test_that("Split a data object time-based (split.basis = 'mails', sliding.window pasta = lapply(results, function(cf.data) cf.data$get.pasta()), synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) - expect_equal(results.data, expected.data, info = "Data for ranges.") -}) + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + + expect_equal(results.data, expected.data, info = "Data for ranges for too-large activity amount (activity.amount).") +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) ## -## Tests for split.data.time.based(..., split.basis = 'issues'), using sliding windows +## Tests for split.data.activity.based(..., activity.amount = ..., activity.type = 'mails') ## -test_that("Split a data object time-based (split.basis = 'issues', sliding.window = TRUE).", { +patrick::with_parameters_test_that("Split a data object activity-based (activity.amount = ..., activity.type = 'mails').", { ## configuration objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) net.conf = NetworkConf$new() ## data object @@ -326,17 +247,18 @@ test_that("Split a data object time-based (split.basis = 'issues', sliding.windo ) ## split data - results = split.data.time.based(project.data, time.period = "2 years", - split.basis = "issues", sliding.window = TRUE) + results = split.data.activity.based(project.data, activity.amount = 3, + activity.type = "mails", sliding.window = FALSE) ## check time ranges expected = c( - "2013-04-21 23:52:09-2015-04-22 11:52:09", - "2014-04-22 05:52:09-2016-04-21 17:52:09", - "2015-04-22 11:52:09-2017-04-21 23:52:09", - "2016-04-21 17:52:09-2017-05-23 12:32:40" + "2004-10-09 18:38:13-2010-07-12 11:05:35", + "2010-07-12 11:05:35-2010-07-12 12:05:41", + "2010-07-12 12:05:41-2010-07-12 12:05:44", + "2010-07-12 12:05:44-2016-07-12 15:58:40", + "2016-07-12 15:58:40-2016-07-12 16:05:37", + "2016-07-12 16:05:37-2016-07-12 16:05:38" ) - lapply(results, function(res) { expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") }) @@ -347,12 +269,13 @@ test_that("Split a data object time-based (split.basis = 'issues', 
sliding.windo ## test that the config contains the correct splitting information expected.config = list( - split.type = "time-based", - split.length = "2 years", - split.basis = "issues", - split.sliding.window = TRUE, - split.revisions = c("2013-04-21 23:52:09", "2014-04-22 05:52:09", "2015-04-22 11:52:09", - "2016-04-21 17:52:09", "2017-04-21 23:52:09", "2017-05-23 12:32:40"), + split.type = "activity-based", + split.length = 3, + split.basis = "mails", + split.sliding.window = FALSE, + split.revisions = c("2004-10-09 18:38:13", "2010-07-12 11:05:35", "2010-07-12 12:05:41", + "2010-07-12 12:05:44" ,"2016-07-12 15:58:40", "2016-07-12 16:05:37", + "2016-07-12 16:05:38"), split.revision.dates = NULL ) lapply(results, function(res) { @@ -364,40 +287,53 @@ test_that("Split a data object time-based (split.basis = 'issues', sliding.windo ## check data for all ranges expected.data = list( commits = list( - "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$commits[0, ], - "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$commits[0, ], - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$commits, - "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$commits + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$commits[0, ], + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$commits[0, ], + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$commits[0, ], + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$commits[0, ], + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$commits[1:2, ], + "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$commits[0, ] ), commit.messages = list( - "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$commit.messages, - "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$commit.messages, - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$commit.messages, - "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$commit.messages + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$commit.messages, + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$commit.messages, + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$commit.messages, + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$commit.messages, + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$commit.messages, + "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$commit.messages ), issues = list( - "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$issues[rownames(data$issues) %in% 1:13, ], - "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$issues[0, ], - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% c(14:34, 37:49), ], - "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(14:36, 37:49), ] + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$issues[0, ], + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$issues[0, ], + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$issues[0, ], + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$issues[rownames(data$issues) %in% c(1:13, 27:28, 43:44), ], + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:40, 45:49), ], + "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$issues[0, ] ), mails = list( - "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$mails[0, ], - "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$mails[0, ], - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$mails[rownames(data$mails) %in% 14:17, ], - "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$mails[rownames(data$mails) %in% 14:17, ] + ## comments indicate row names when pasta is not configured + "2004-10-09 18:38:13-2010-07-12 
11:05:35" = data$mails[1:3, ], # rownames(data$mails) %in% 1:3 + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$mails[4:6, ], # rownames(data$mails) %in% 4:6 + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$mails[7:9, ], # rownames(data$mails) %in% 7:9 + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$mails[10:12, ], # rownames(data$mails) %in% 10:12 + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$mails[13:15, ], # rownames(data$mails) %in% 14:16 + "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$mails[16, ] # rownames(data$mails) %in% 17 ), pasta = list( - "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$pasta, - "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$pasta, - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$pasta, - "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$pasta + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$pasta, + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$pasta, + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$pasta, + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$pasta, + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$pasta, + "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$pasta ), synchronicity = list( - "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$synchronicity, - "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$synchronicity, - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$synchronicity, - "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$synchronicity + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$synchronicity, + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$synchronicity, + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$synchronicity, + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$synchronicity, + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$synchronicity, + "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$synchronicity ) ) results.data = list( @@ -408,46 +344,27 @@ test_that("Split a data object time-based (split.basis = 'issues', sliding.windo pasta = lapply(results, function(cf.data) cf.data$get.pasta()), synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) - expect_equal(results.data, expected.data, info = "Data for ranges.") -}) + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) -## * * bins ---------------------------------------------------------------- - -## -## Tests for split.data.time.based(..., bins = ...), sliding windows parameter ignored -## - -test_that("Split a data object time-based (bins = ... 
, sliding.window = TRUE).", { - - ## configuration objects - proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("issues.only.comments", FALSE) - net.conf = NetworkConf$new() + expect_equal(results.data, expected.data, info = "Data for ranges.") - ## data object - project.data = ProjectData$new(proj.conf) - data = list( - commits = project.data$get.commits.unfiltered(), - commit.messages = project.data$get.commit.messages(), - issues = project.data$get.issues(), - mails = project.data$get.mails(), - pasta = project.data$get.pasta(), - synchronicity = project.data$get.synchronicity() - ) + ## + ## split by too-large activity amount + ## ## split data - results = split.data.time.based(project.data, bins = c("2016-01-01 00:00:00", "2016-12-31 23:59:59", - "2017-06-03 03:03:03"), - split.basis = "mails", sliding.window = TRUE) + results = split.data.activity.based(project.data, activity.amount = nrow(data$mails) + 10, + activity.type = "mails", sliding.window = FALSE) ## check time ranges expected = c( - "2016-01-01 00:00:00-2016-12-31 23:59:59", - "2016-12-31 23:59:59-2017-06-03 03:03:03" + "2004-10-09 18:38:13-2016-07-12 16:05:38" ) lapply(results, function(res) { - expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") + expect_equal(res$get.project.conf()$get.value("ranges"), expected, + info = "Time ranges (too-large activity amount).") }) ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. @@ -456,11 +373,11 @@ test_that("Split a data object time-based (bins = ... , sliding.window = TRUE)." ## test that the config contains the correct splitting information expected.config = list( - split.type = "time-based", - split.length = c("2016-01-01 00:00:00", "2016-12-31 23:59:59", "2017-06-03 03:03:03"), - split.basis = NULL, + split.type = "activity-based", + split.length = 26, + split.basis = "mails", split.sliding.window = FALSE, - split.revisions = c("2016-01-01 00:00:00", "2016-12-31 23:59:59", "2017-06-03 03:03:03"), + split.revisions = c("2004-10-09 18:38:13", "2016-07-12 16:05:38"), split.revision.dates = NULL ) lapply(results, function(res) { @@ -472,28 +389,22 @@ test_that("Split a data object time-based (bins = ... , sliding.window = TRUE)." 
## check data for all ranges expected.data = list( commits = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$commits, - "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$commits[0, ] + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$commits[1:2, ] ), commit.messages = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$commit.messages, - "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$commit.messages + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$commit.messages ), issues = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$issues[rownames(data$issues) %in% c(14:34, 37:49), ], - "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$issues[rownames(data$issues) %in% 35:36, ] + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:15, 20:22, 27:29, 37:40, 43:45, 46:49), ] ), mails = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$mails[rownames(data$mails) %in% 13:17, ], - "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$mails[0, ] + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$mails ), pasta = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$pasta, - "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$pasta + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$pasta ), synchronicity = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$synchronicity, - "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$synchronicity + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$synchronicity ) ) results.data = list( @@ -504,20 +415,27 @@ test_that("Split a data object time-based (bins = ... , sliding.window = TRUE)." pasta = lapply(results, function(cf.data) cf.data$get.pasta()), synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) - expect_equal(results.data, expected.data, info = "Data for ranges.") -}) -## * activity-based -------------------------------------------------------- + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + + expect_equal(results.data, expected.data, info = "Data for ranges (too-large activity amount).") +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) + ## -## Tests for split.data.activity.based(..., activity.type = 'commits') using sliding windows +## Tests for split.data.activity.based(..., activity.amount = ..., activity.type = 'issues') ## -test_that("Split a data object activity-based (activity.type = 'commits', sliding.window = TRUE).", { +patrick::with_parameters_test_that("Split a data object activity-based (activity.amount = ..., activity.type = 'issues').", { ## configuration objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) net.conf = NetworkConf$new() ## data object @@ -532,19 +450,19 @@ test_that("Split a data object activity-based (activity.type = 'commits', slidin ) ## split data - results = split.data.activity.based(project.data, activity.amount = 3, - activity.type = "commits", sliding.window = TRUE) + results = split.data.activity.based(project.data, activity.amount = 9, + activity.type = "issues", sliding.window = FALSE) ## check time ranges expected = c( - "2016-07-12 15:58:59-2016-07-12 16:06:10", - "2016-07-12 16:00:45-2016-07-12 16:06:20", - 
"2016-07-12 16:06:10-2016-07-12 16:06:32", - "2016-07-12 16:06:20-2016-07-12 16:06:33" + "2013-04-21 23:52:09-2013-05-25 06:22:23", + "2013-05-25 06:22:23-2016-07-12 15:59:59", + "2016-07-12 15:59:59-2016-07-12 16:06:30", + "2016-07-12 16:06:30-2016-10-05 15:30:02", + "2016-10-05 15:30:02-2017-05-23 12:32:40" ) lapply(results, function(res) { - expect_equal(res$get.project.conf()$get.value("ranges"), expected, - info = "Time ranges (activity.amount).") + expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") }) ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. @@ -554,11 +472,11 @@ test_that("Split a data object activity-based (activity.type = 'commits', slidin ## test that the config contains the correct splitting information expected.config = list( split.type = "activity-based", - split.length = 3, - split.basis = "commits", - split.sliding.window = TRUE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:06:10", - "2016-07-12 16:06:20", "2016-07-12 16:06:32", "2016-07-12 16:06:33"), + split.length = 9, + split.basis = "issues", + split.sliding.window = FALSE, + split.revisions = c("2013-04-21 23:52:09", "2013-05-25 06:22:23", "2016-07-12 15:59:59", + "2016-07-12 16:06:30", "2016-10-05 15:30:02", "2017-05-23 12:32:40"), split.revision.dates = NULL ) lapply(results, function(res) { @@ -570,40 +488,47 @@ test_that("Split a data object activity-based (activity.type = 'commits', slidin ## check data for all ranges expected.data = list( commits = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commits[1:3, ], - "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$commits[2:4, ], - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commits[4:6, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commits[5:8, ] + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$commits[0, ], + "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$commits[1, ], + "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$commits[2:5, ], + "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$commits[6:8, ], + "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$commits[0, ] ), commit.messages = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commit.messages, - "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$commit.messages, - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commit.messages, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$commit.messages, + "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$commit.messages, + "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$commit.messages, + "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$commit.messages, + "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:41, 45:49), ], - "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(14:15, 29, 40:41, 45:49), ], - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 23, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ] + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$issues[rownames(data$issues) %in% 1:10, ], + "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$issues[rownames(data$issues) %in% c(11:13, 20:21, 27:28, 43:44, 37:38), ], + "2016-07-12 15:59:59-2016-07-12 16:06:30" = 
data$issues[rownames(data$issues) %in% c(14:15, 22, 29, 39:41, 45:49), ], + "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$issues[rownames(data$issues) %in% c(16:19, 23:25, 30, 42), ], + "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(26, 31:36), ] ), mails = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[rownames(data$mails) %in% 16:17, ], - "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$mails[rownames(data$mails) %in% 16:17, ], - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$mails[0, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ] + ## comments indicate row names when pasta is not configured + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$mails[0, ], + "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$mails[13:14, ], # rownames(data$mails) %in% 14:15 + "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 + "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$mails[0, ], + "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$mails[0, ] ), pasta = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$pasta, - "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$pasta, - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$pasta, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$pasta, + "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$pasta, + "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$pasta, + "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$pasta, + "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$pasta ), synchronicity = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$synchronicity, - "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$synchronicity, - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$synchronicity, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity + "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$synchronicity, + "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$synchronicity, + "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$synchronicity, + "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$synchronicity, + "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$synchronicity ) ) results.data = list( @@ -614,19 +539,23 @@ test_that("Split a data object activity-based (activity.type = 'commits', slidin pasta = lapply(results, function(cf.data) cf.data$get.pasta()), synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) - expect_equal(results.data, expected.data, info = "Data for ranges (activity.amount).") + + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + + expect_equal(results.data, expected.data, info = "Data for ranges.") ## ## split by too-large activity amount ## ## split data - results = split.data.activity.based(project.data, activity.amount = nrow(data$commits) + 10, - activity.type = "commits", sliding.window = TRUE) + results = split.data.activity.based(project.data, activity.amount = nrow(data$issues) + 10, + activity.type = "issues", sliding.window = FALSE) ## check time ranges expected = c( - "2016-07-12 15:58:59-2016-07-12 16:06:33" + "2013-04-21 23:52:09-2017-05-23 12:32:40" ) lapply(results, function(res) { expect_equal(res$get.project.conf()$get.value("ranges"), expected, @@ -640,10 +569,10 @@ test_that("Split a data object activity-based (activity.type = 'commits', slidin ## test that the config contains the correct splitting information 
expected.config = list( split.type = "activity-based", - split.length = 18, - split.basis = "commits", - split.sliding.window = TRUE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:33"), + split.length = 59, + split.basis = "issues", + split.sliding.window = FALSE, + split.revisions = c("2013-04-21 23:52:09", "2017-05-23 12:32:40"), split.revision.dates = NULL ) lapply(results, function(res) { @@ -655,22 +584,22 @@ test_that("Split a data object activity-based (activity.type = 'commits', slidin ## check data for all ranges expected.data = list( commits = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$commits + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$commits ), commit.messages = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$commit.messages + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(14:15, 20:23, 29, 37:41, 45:49), ] + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$issues ), mails = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$mails[rownames(data$mails) %in% 16:17, ] + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$mails[13:16, ] # when pasta is not configured: rownames(data$mails) %in% 14:17 ), pasta = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$pasta + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$pasta ), synchronicity = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$synchronicity + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$synchronicity ) ) results.data = list( @@ -681,24 +610,55 @@ test_that("Split a data object activity-based (activity.type = 'commits', slidin pasta = lapply(results, function(cf.data) cf.data$get.pasta()), synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) - expect_equal(results.data, expected.data, info = "Data for ranges for too-large activity amount (activity.amount).") - ## - ## split by number of windows (i.e., ignoring sliding windows) - ## + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + + expect_equal(results.data, expected.data, info = "Data for ranges (too-large activity amount).") +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) + + +## * * activity amount, sliding windows ------------------------------------ +## +## Tests for split.data.activity.based(..., activity.type = 'commits') using sliding windows +## + +patrick::with_parameters_test_that("Split a data object activity-based (activity.type = 'commits', sliding.window = TRUE).", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits.unfiltered(), + commit.messages = project.data$get.commit.messages(), + issues = project.data$get.issues(), + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() + ) ## split data - results = split.data.activity.based(project.data, number.windows = 2, + results = 
split.data.activity.based(project.data, activity.amount = 3, activity.type = "commits", sliding.window = TRUE) ## check time ranges expected = c( - "2016-07-12 15:58:59-2016-07-12 16:06:20", + "2016-07-12 15:58:59-2016-07-12 16:06:10", + "2016-07-12 16:00:45-2016-07-12 16:06:20", + "2016-07-12 16:06:10-2016-07-12 16:06:32", "2016-07-12 16:06:20-2016-07-12 16:06:33" ) lapply(results, function(res) { expect_equal(res$get.project.conf()$get.value("ranges"), expected, - info = "Time ranges (number.windows).") + info = "Time ranges (activity.amount).") }) ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. @@ -708,10 +668,11 @@ test_that("Split a data object activity-based (activity.type = 'commits', slidin ## test that the config contains the correct splitting information expected.config = list( split.type = "activity-based", - split.length = 4, + split.length = 3, split.basis = "commits", - split.sliding.window = FALSE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:20", "2016-07-12 16:06:33"), + split.sliding.window = TRUE, + split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:06:10", + "2016-07-12 16:06:20", "2016-07-12 16:06:32", "2016-07-12 16:06:33"), split.revision.dates = NULL ) lapply(results, function(res) { @@ -723,27 +684,40 @@ test_that("Split a data object activity-based (activity.type = 'commits', slidin ## check data for all ranges expected.data = list( commits = list( - "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$commits[1:4, ], + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commits[1:3, ], + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$commits[2:4, ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commits[4:6, ], "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commits[5:8, ] ), commit.messages = list( - "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$commit.messages, + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commit.messages, + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$commit.messages, + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commit.messages, "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:41, 45:49), ], + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:41, 45:49), ], + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(14:15, 29, 40:41, 45:49), ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 23, ], "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ] ), mails = list( - "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$mails[rownames(data$mails) %in% 16:17, ], + ## comments indicate row names when pasta is not configured + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$mails[0, ], "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ] ), pasta = list( - "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$pasta, + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$pasta, + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$pasta, + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$pasta, "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta ), 
synchronicity = list( - "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$synchronicity, + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$synchronicity, + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$synchronicity, + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$synchronicity, "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity ) ) @@ -755,28 +729,93 @@ test_that("Split a data object activity-based (activity.type = 'commits', slidin pasta = lapply(results, function(cf.data) cf.data$get.pasta()), synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) - expect_equal(results.data, expected.data, info = "Data for ranges (number.windows).") - ## too large number of windows (i.e., ignoring sliding windows) + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) - expect_error( - split.data.activity.based(project.data, activity.type = "commits", - number.windows = nrow(project.data$get.commits.unfiltered()) + 10, sliding.window = TRUE), - info = "Error expected (number.windows) (1)." + expect_equal(results.data, expected.data, info = "Data for ranges (activity.amount).") + + ## + ## split by too-large activity amount + ## + + ## split data + results = split.data.activity.based(project.data, activity.amount = nrow(data$commits) + 10, + activity.type = "commits", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2016-07-12 15:58:59-2016-07-12 16:06:33" ) + lapply(results, function(res) { + expect_equal(res$get.project.conf()$get.value("ranges"), expected, + info = "Time ranges (too-large activity amount).") + }) - expect_error( - split.data.activity.based(project.data, activity.type = "commits", number.windows = 0, sliding.window = TRUE), - info = "Error expected (number.windows) (2)." + ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. 
+ expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), + info = "Splitting must not modify the original ProjectConf.") + + ## test that the config contains the correct splitting information + expected.config = list( + split.type = "activity-based", + split.length = 18, + split.basis = "commits", + split.sliding.window = TRUE, + split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:33"), + split.revision.dates = NULL ) + lapply(results, function(res) { + actual = lapply(names(expected.config), res$get.project.conf()$get.value) + names(actual) = names(expected.config) + expect_equal(expected.config, actual) + }) -}) + ## check data for all ranges + expected.data = list( + commits = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$commits + ), + commit.messages = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$commit.messages + ), + issues = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(14:15, 20:23, 29, 37:41, 45:49), ] + ), + mails = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$mails[15:16, ] # when pasta is not configured: rownames(data$mails) %in% 16:17 + ), + pasta = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$pasta + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$synchronicity + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) + ) + + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + + expect_equal(results.data, expected.data, info = "Data for ranges for too-large activity amount (activity.amount).") +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) -test_that("Split a data object activity-based (activity.type = 'commits', sliding.window = TRUE), continued.", { +patrick::with_parameters_test_that("Split a data object activity-based (activity.type = 'commits', sliding.window = TRUE), continued.", { ## configuration objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) net.conf = NetworkConf$new() ## data object @@ -861,8 +900,9 @@ test_that("Split a data object activity-based (activity.type = 'commits', slidin "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$issues[0, ] ), mails = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[rownames(data$mails) %in% 16:17, ], - "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$mails[rownames(data$mails) %in% 16:17, ], + ## comments indicate row names when pasta is not configured + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$mails[0, ], "2016-07-12 16:06:20-2016-07-12 
16:06:33" = data$mails[0, ], "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$mails[0, ] @@ -890,20 +930,28 @@ test_that("Split a data object activity-based (activity.type = 'commits', slidin pasta = lapply(results, function(cf.data) cf.data$get.pasta()), synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) + + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + expect_equal(results.data, expected.data, info = "Data for ranges (activity.amount).") -}) +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) ## ## Tests for split.data.activity.based(..., activity.type = 'mails') using sliding windows ## -test_that("Split a data object activity-based (activity.type = 'mails', sliding.window = TRUE).", { +patrick::with_parameters_test_that("Split a data object activity-based (activity.type = 'mails', sliding.window = TRUE).", { ## configuration objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) net.conf = NetworkConf$new() ## data object @@ -999,16 +1047,17 @@ test_that("Split a data object activity-based (activity.type = 'mails', sliding. "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:40, 45:49), ] ), mails = list( - "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$mails[rownames(data$mails) %in% 1:3, ], - "2005-02-09 18:49:49-2010-07-12 12:05:34" = data$mails[rownames(data$mails) %in% 2:4, ], - "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$mails[rownames(data$mails) %in% 4:6, ], - "2010-07-12 12:05:34-2010-07-12 12:05:42" = data$mails[rownames(data$mails) %in% 5:7, ], - "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$mails[rownames(data$mails) %in% 7:9, ], - "2010-07-12 12:05:42-2010-07-12 12:05:45" = data$mails[rownames(data$mails) %in% 8:10, ], - "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$mails[rownames(data$mails) %in% 10:12, ], - "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$mails[rownames(data$mails) %in% c(11:12, 14), ], - "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$mails[rownames(data$mails) %in% 14:16, ], - "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 15:17, ] + ## comments indicate row names when pasta is not configured + "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$mails[1:3, ], # rownames(data$mails) %in% 1:3 + "2005-02-09 18:49:49-2010-07-12 12:05:34" = data$mails[2:4, ], # rownames(data$mails) %in% 2:4 + "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$mails[4:6, ], # rownames(data$mails) %in% 4:6 + "2010-07-12 12:05:34-2010-07-12 12:05:42" = data$mails[5:7, ], # rownames(data$mails) %in% 5:7 + "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$mails[7:9, ], # rownames(data$mails) %in% 7:9 + "2010-07-12 12:05:42-2010-07-12 12:05:45" = data$mails[8:10, ], # rownames(data$mails) %in% 8:10 + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$mails[10:12, ], # rownames(data$mails) %in% 10:12 + "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$mails[11:13, ], # rownames(data$mails) %in% c(11:12, 14) + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$mails[13:15, ], # rownames(data$mails) %in% 14:16 + "2016-07-12 15:58:50-2016-07-12 16:05:38" = 
data$mails[14:16, ] # rownames(data$mails) %in% 15:17 ), pasta = list( "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$pasta, @@ -1043,6 +1092,10 @@ test_that("Split a data object activity-based (activity.type = 'mails', sliding. pasta = lapply(results, function(cf.data) cf.data$get.pasta()), synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) + + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + expect_equal(results.data, expected.data, info = "Data for ranges.") ## @@ -1110,106 +1163,27 @@ test_that("Split a data object activity-based (activity.type = 'mails', sliding. pasta = lapply(results, function(cf.data) cf.data$get.pasta()), synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) - expect_equal(results.data, expected.data, info = "Data for ranges (too-large activity amount).") - - ## - ## split by number of windows (i.e., ignoring sliding windows) - ## - - ## split data - results = split.data.activity.based(project.data, number.windows = 2, - activity.type = "mail", sliding.window = TRUE) - - ## check time ranges - expected = c( - "2004-10-09 18:38:13-2010-07-12 12:05:43", - "2010-07-12 12:05:43-2016-07-12 16:05:38" - ) - lapply(results, function(res) { - expect_equal(res$get.project.conf()$get.value("ranges"), expected, - info = "Time ranges (number.windows).") - }) - - ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. - expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), - info = "Splitting must not modify the original ProjectConf.") - ## test that the config contains the correct splitting information - expected.config = list( - split.type = "activity-based", - split.length = 8, - split.basis = "mails", - split.sliding.window = FALSE, - split.revisions = c("2004-10-09 18:38:13", "2010-07-12 12:05:43", "2016-07-12 16:05:38"), - split.revision.dates = NULL - ) - lapply(results, function(res) { - actual = lapply(names(expected.config), res$get.project.conf()$get.value) - names(actual) = names(expected.config) - expect_equal(expected.config, actual) - }) + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) - ## check data for all ranges - expected.data = list( - commits = list( - "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$commits[0, ], - "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$commits[1:2, ] - ), - commit.messages = list( - "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$commit.messages, - "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$commit.messages - ), - issues = list( - "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$issues[0, ], - "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:15, 20:22, 27:29, 37:40, 43:49), ] - ), - mails = list( - "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$mails[rownames(data$mails) %in% 1:8, ], - "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 9:17, ] - ), - pasta = list( - "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$pasta, - "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$pasta - ), - synchronicity = list( - "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$synchronicity, - "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$synchronicity - ) - ) - results.data = list( - commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), - 
commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), - issues = lapply(results, function(cf.data) cf.data$get.issues()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) - ) - expect_equal(results.data, expected.data, info = "Data for ranges (number.windows).") - - ## too large number of windows (i.e., ignoring sliding windows) - - expect_error( - split.data.activity.based(project.data, activity.type = "mails", - number.windows = nrow(project.data$get.mails()) + 10, sliding.window = TRUE), - info = "Error expected (number.windows) (1)." - ) - - expect_error( - split.data.activity.based(project.data, activity.type = "mails", number.windows = 0, sliding.window = TRUE), - info = "Error expected (number.windows) (2)." - ) -}) + expect_equal(results.data, expected.data, info = "Data for ranges (too-large activity amount).") +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) ## ## Tests for split.data.activity.based(..., activity.type = 'issues') using sliding windows ## -test_that("Split a data object activity-based (activity.type = 'issues', sliding.window = TRUE).", { +patrick::with_parameters_test_that("Split a data object activity-based (activity.type = 'issues', sliding.window = TRUE).", { ## configuration objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) net.conf = NetworkConf$new() ## data object @@ -1301,12 +1275,13 @@ test_that("Split a data object activity-based (activity.type = 'issues', sliding "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(26, 31:36), ] ), mails = list( + ## comments indicate row names when pasta is not configured "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$mails[0, ], "2013-05-06 01:04:34-2016-07-12 15:30:02" = data$mails[0, ], - "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$mails[rownames(data$mails) %in% 14:15, ], - "2016-07-12 15:30:02-2016-07-12 16:02:02" = data$mails[rownames(data$mails) %in% 14:15, ], - "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$mails[rownames(data$mails) %in% 16:17, ], - "2016-07-12 16:02:02-2016-07-27 20:12:08" = data$mails[rownames(data$mails) %in% 16:17, ], + "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$mails[13:14, ], # rownames(data$mails) %in% 14:15 + "2016-07-12 15:30:02-2016-07-12 16:02:02" = data$mails[13:14, ], # rownames(data$mails) %in% 14:15 + "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 + "2016-07-12 16:02:02-2016-07-27 20:12:08" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$mails[0, ], "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$mails[0, ], "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$mails[0, ] @@ -1342,6 +1317,10 @@ test_that("Split a data object activity-based (activity.type = 'issues', sliding pasta = lapply(results, function(cf.data) cf.data$get.pasta()), synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) + + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = 
remove.row.names.from.inner.list.of.dfs(results.data) + expect_equal(results.data, expected.data, info = "Data for ranges.") ## @@ -1392,7 +1371,7 @@ test_that("Split a data object activity-based (activity.type = 'issues', sliding "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$issues ), mails = list( - "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$mails[rownames(data$mails) %in% 14:17, ] + "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$mails[13:16, ] # when pasta is not configured: rownames(data$mails) %in% 14:17 ), pasta = list( "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$pasta @@ -1409,20 +1388,50 @@ test_that("Split a data object activity-based (activity.type = 'issues', sliding pasta = lapply(results, function(cf.data) cf.data$get.pasta()), synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) + + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + expect_equal(results.data, expected.data, info = "Data for ranges (too-large activity amount).") +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) - ## - ## split by number of windows (i.e., ignoring sliding windows) - ## +## * * window numbers ------------------------------------------------------ + +## +## Tests for split.data.activity.based(..., number.windows = ..., activity.type = 'commits') +## + +patrick::with_parameters_test_that("Split a data object activity-based (number.windows = ..., activity.type = 'commits').", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits.unfiltered(), + commit.messages = project.data$get.commit.messages(), + issues = project.data$get.issues(), + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() + ) ## split data results = split.data.activity.based(project.data, number.windows = 2, - activity.type = "issues", sliding.window = TRUE) + activity.type = "commits", + sliding.window = test.sliding.window) ## check time ranges expected = c( - "2013-04-21 23:52:09-2016-07-12 16:02:02", - "2016-07-12 16:02:02-2017-05-23 12:32:40" + "2016-07-12 15:58:59-2016-07-12 16:06:20", + "2016-07-12 16:06:20-2016-07-12 16:06:33" ) lapply(results, function(res) { expect_equal(res$get.project.conf()$get.value("ranges"), expected, @@ -1436,10 +1445,10 @@ test_that("Split a data object activity-based (activity.type = 'issues', sliding ## test that the config contains the correct splitting information expected.config = list( split.type = "activity-based", - split.length = 21, - split.basis = "issues", + split.length = 4, + split.basis = "commits", split.sliding.window = FALSE, - split.revisions = c("2013-04-21 23:52:09", "2016-07-12 16:02:02", "2017-05-23 12:32:40"), + split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:20", "2016-07-12 16:06:33"), split.revision.dates = NULL ) lapply(results, function(res) { @@ -1451,28 +1460,28 @@ test_that("Split a data object activity-based (activity.type = 'issues', sliding ## check data for all 
ranges expected.data = list( commits = list( - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$commits[1:2, ], - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$commits[3:8, ] + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$commits[1:4, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commits[5:8, ] ), commit.messages = list( - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$commit.messages, - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$commit.messages + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$commit.messages, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$issues[rownames(data$issues) %in% c(1:14, 20:22, 27:28, 37:40, 43:44), ], - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(15:19, 23:26, 29:36, 41:42, 45:49), ] + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:41, 45:49), ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ] ), mails = list( - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$mails[rownames(data$mails) %in% 14:15, ], - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$mails[rownames(data$mails) %in% 16:17, ] + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$mails[15:16, ], # when pasta is not configured: rownames(data$mails) %in% 16:17 + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ] ), pasta = list( - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$pasta, - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$pasta + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$pasta, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta ), synchronicity = list( - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$synchronicity, - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$synchronicity + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$synchronicity, + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity ) ) results.data = list( @@ -1483,225 +1492,289 @@ test_that("Split a data object activity-based (activity.type = 'issues', sliding pasta = lapply(results, function(cf.data) cf.data$get.pasta()), synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) ) + + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + expect_equal(results.data, expected.data, info = "Data for ranges (number.windows).") - ## too large number of windows (i.e., ignoring sliding windows) + ## too large number of windows expect_error( - split.data.activity.based(project.data, activity.type = "issues", - number.windows = nrow(project.data$get.issues()) + 10, sliding.window = TRUE), + split.data.activity.based(project.data, activity.type = "commits", + number.windows = nrow(project.data$get.commits.unfiltered()) + 10, + sliding.window = test.sliding.window), info = "Error expected (number.windows) (1)." ) expect_error( - split.data.activity.based(project.data, activity.type = "issues", number.windows = 0, sliding.window = TRUE), + split.data.activity.based(project.data, activity.type = "commits", number.windows = 0, + sliding.window = test.sliding.window), info = "Error expected (number.windows) (2)." 
) -}) - - -## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / -## Split network ----------------------------------------------------------- -## * time-based ------------------------------------------------------------ +}, cases.cross.product( + patrick::cases( + ## The sliding window parameter should be ignored. + "sliding window: FALSE" = list(test.sliding.window = FALSE), + "sliding window: TRUE" = list(test.sliding.window = TRUE) + ), + patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) + ) +)) -## * * time period --------------------------------------------------------- ## -## Tests for split.network.time.based(..., time.period = ...) using sliding windows +## Tests for split.data.activity.based(..., number.windows = ..., activity.type = 'mails') ## -test_that("Split a network time-based (time.period = ... , sliding.window = TRUE).", { - - ## time period - time.period = "2 mins" +patrick::with_parameters_test_that("Split a data object activity-based (number.windows = ..., activity.type = 'mails').", { - ## configuration and data objects + ## configuration objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("commits.filter.base.artifact", FALSE) + proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) net.conf = NetworkConf$new() - net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) - project.data = ProjectData$new(proj.conf) - net.builder = NetworkBuilder$new(project.data, net.conf) - ## - ## simplify = FALSE - ## + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits.unfiltered(), + commit.messages = project.data$get.commit.messages(), + issues = project.data$get.issues(), + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() + ) - ## retrieve author network - author.net = net.builder$get.author.network() + ## split data + results = split.data.activity.based(project.data, number.windows = 2, + activity.type = "mail", + sliding.window = test.sliding.window) - expected = list( - "2016-07-12 15:58:59-2016-07-12 16:00:59" = igraph::subgraph.edges(author.net, c(1:2)), - "2016-07-12 15:59:59-2016-07-12 16:01:59" = igraph::subgraph.edges(author.net, c(2)), - "2016-07-12 16:00:59-2016-07-12 16:02:59" = igraph::subgraph.edges(author.net, c()), - "2016-07-12 16:01:59-2016-07-12 16:03:59" = igraph::subgraph.edges(author.net, c()), - "2016-07-12 16:02:59-2016-07-12 16:04:59" = igraph::subgraph.edges(author.net, c()), - "2016-07-12 16:03:59-2016-07-12 16:05:59" = igraph::subgraph.edges(author.net, c(3,5)), - "2016-07-12 16:04:59-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(3:8)) + ## check time ranges + expected = c( + "2004-10-09 18:38:13-2010-07-12 12:05:43", + "2010-07-12 12:05:43-2016-07-12 16:05:38" ) - results = split.network.time.based(author.net, time.period = "2 mins", sliding.window = TRUE) + lapply(results, function(res) { + expect_equal(res$get.project.conf()$get.value("ranges"), expected, + info = "Time ranges (number.windows).") + }) - ## check ranges (labels) - expect_equal(names(results), names(expected), info = "Time ranges.") + ## This value should not change, so we compare it with the default, which is 
`c("v1-v2", "v2-v3")`. + expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), + info = "Splitting must not modify the original ProjectConf.") + + ## test that the config contains the correct splitting information + expected.config = list( + split.type = "activity-based", + split.length = 8, + split.basis = "mails", + split.sliding.window = FALSE, + split.revisions = c("2004-10-09 18:38:13", "2010-07-12 12:05:43", "2016-07-12 16:05:38"), + split.revision.dates = NULL + ) + lapply(results, function(res) { + actual = lapply(names(expected.config), res$get.project.conf()$get.value) + names(actual) = names(expected.config) + expect_equal(expected.config, actual) + }) + + ## check data for all ranges + expected.data = list( + commits = list( + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$commits[0, ], + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$commits[1:2, ] + ), + commit.messages = list( + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$commit.messages, + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$commit.messages + ), + issues = list( + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$issues[0, ], + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:15, 20:22, 27:29, 37:40, 43:45, 46:49), ] + ), + mails = list( + ## comments indicate row names when pasta is not configured + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$mails[1:8, ], # rownames(data$mails) %in% 1:8 + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$mails[9:16, ] # rownames(data$mails) %in% 9:17 + ), + pasta = list( + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$pasta, + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$pasta + ), + synchronicity = list( + "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$synchronicity, + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$synchronicity + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) + ) + + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + + expect_equal(results.data, expected.data, info = "Data for ranges (number.windows).") + + ## too large number of windows + + expect_error( + split.data.activity.based(project.data, activity.type = "mails", + number.windows = nrow(project.data$get.mails()) + 10, + sliding.window = test.sliding.window), + info = "Error expected (number.windows) (1)."
+ ) + + expect_error( + split.data.activity.based(project.data, activity.type = "mails", number.windows = 0, + sliding.window = test.sliding.window), + info = "Error expected (number.windows) (2)." + ) -}) +}, cases.cross.product( + patrick::cases( + ## The sliding window parameter should be ignored. + "sliding window: FALSE" = list(test.sliding.window = FALSE), + "sliding window: TRUE" = list(test.sliding.window = TRUE) + ), + patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) + ) +)) -## * activity-based ------------------------------------------------------------ ## -## Tests for split.network.activity.based(...) using sliding windows +## Tests for split.data.activity.based(..., number.windows = ..., activity.type = 'issues') ## -test_that("Split a network activity-based (number.edges, number.windows, sliding.window = TRUE).", { +patrick::with_parameters_test_that("Split a data object activity-based (number.windows = ..., activity.type = 'issues').", { - ## configuration and data objects + ## configuration objects proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("commits.filter.base.artifact", FALSE) + proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) net.conf = NetworkConf$new() - net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) - project.data = ProjectData$new(proj.conf) - net.builder = NetworkBuilder$new(project.data, net.conf) - ## retrieve author network - author.net = net.builder$get.author.network() - - ## - ## number.edges (1) - ## - - ## results - expected = list( - "2016-07-12 15:58:59-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(1, 2)), - "2016-07-12 16:00:45-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(2, 3)), - "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(3, 5)), - "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(5, 4)), - "2016-07-12 16:06:10-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(4, 7)), - "2016-07-12 16:06:10-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(7, 6)), - "2016-07-12 16:06:32-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(6, 8)) + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits.unfiltered(), + commit.messages = project.data$get.commit.messages(), + issues = project.data$get.issues(), + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() ) - results = split.network.activity.based(author.net, number.edges = 2, sliding.window = TRUE) - ## check ranges (labels) - expect_equal(names(results), names(expected), info = "Time ranges (number.edges (1)).") + ## split data + results = split.data.activity.based(project.data, number.windows = 2, + activity.type = "issues", + sliding.window = test.sliding.window) - ## check networks - check.identical = mapply(results, expected, FUN = function(r, e) { - igraph::identical_graphs(r, e) + ## check time ranges + expected = c( + "2013-04-21 23:52:09-2016-07-12 16:02:02", + "2016-07-12 16:02:02-2017-05-23 12:32:40" + ) + lapply(results, function(res) { + expect_equal(res$get.project.conf()$get.value("ranges"), expected, + info = "Time ranges 
(number.windows).") }) + + ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. + expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), + info = "Splitting must not modify the original ProjectConf.") + + ## test that the config contains the correct splitting information + expected.config = list( + split.type = "activity-based", + split.length = 21, + split.basis = "issues", + split.sliding.window = FALSE, + split.revisions = c("2013-04-21 23:52:09", "2016-07-12 16:02:02", "2017-05-23 12:32:40"), + split.revision.dates = NULL + ) + lapply(results, function(res) { + actual = lapply(names(expected.config), res$get.project.conf()$get.value) + names(actual) = names(expected.config) + expect_equal(expected.config, actual) + }) + + ## check data for all ranges + expected.data = list( + commits = list( + "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$commits[1:2, ], + "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$commits[3:8, ] + ), + commit.messages = list( + "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$commit.messages, + "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$commit.messages + ), + issues = list( + "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$issues[rownames(data$issues) %in% c(1:14, 20:22, 27:28, 37:40, 43:44), ], + "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(15:19, 23:26, 29:36, 41:42, 45:49), ] + ), + mails = list( + ## comments indicate row names when pasta is not configured + "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$mails[13:14, ], # rownames(data$mails) %in% 14:15 + "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$mails[15:16, ] # rownames(data$mails) %in% 16:17 + ), + pasta = list( + "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$pasta, + "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$pasta + ), + synchronicity = list( + "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$synchronicity, + "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$synchronicity + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data)
cf.data$get.synchronicity()) ) - results = split.network.activity.based(author.net, number.windows = 3, sliding.window = TRUE) - ## check ranges (labels) - expect_equal(names(results), names(expected), info = "Time ranges (number.windows (1)).") + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) - ## check networks - check.identical = mapply(results, expected, FUN = function(r, e) { - igraph::identical_graphs(r, e) - }) - expect_true(all(check.identical), info = "Network equality (number.windows (1)).") + expect_equal(results.data, expected.data, info = "Data for ranges (number.windows).") - ## - ## number.windows (2) (i.e., ignoring sliding windows) - ## + ## too large number of windows expect_error( - split.network.activity.based(author.net, number.windows = igraph::ecount(author.net) + 10, - sliding.window = TRUE), - info = "Error expected (number.windows (2))." + split.data.activity.based(project.data, activity.type = "issues", + number.windows = nrow(project.data$get.issues()) + 10, + sliding.window = test.sliding.window), + info = "Error expected (number.windows) (1)." ) -}) - -test_that("Split a network activity-based (number.edges, number.windows, sliding.window = TRUE), continued.", { - - ## configuration and data objects - proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("commits.filter.base.artifact", FALSE) - net.conf = NetworkConf$new() - net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) - project.data = ProjectData$new(proj.conf) - net.builder = NetworkBuilder$new(project.data, net.conf) - - ## retrieve author network and add an additional edge in the end - author.net = net.builder$get.author.network() - author.net = igraph::add_edges(author.net, c("Olaf", "Thomas"), - attr = list(date = get.date.from.string("2020-02-20 20:20:20"))) - - ## - ## number.edges (1) - ## - - ## results - expected = list( - "2016-07-12 15:58:59-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(1, 2)), - "2016-07-12 16:00:45-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(2, 3)), - "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(3, 5)), - "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(5, 4)), - "2016-07-12 16:06:10-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(4, 7)), - "2016-07-12 16:06:10-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(7, 6)), - "2016-07-12 16:06:32-2020-02-20 20:20:20" = igraph::subgraph.edges(author.net, c(6, 8)), - "2016-07-12 16:06:32-2020-02-20 20:20:21" = igraph::subgraph.edges(author.net, c(8, 9)) - ) - results = split.network.activity.based(author.net, number.edges = 2, sliding.window = TRUE) - - ## check ranges (labels) - expect_equal(names(results), names(expected), info = "Time ranges (number.edges (1)).") - - ## check networks - check.identical = mapply(results, expected, FUN = function(r, e) { - igraph::identical_graphs(r, e) - }) - expect_true(all(check.identical), info = "Network equality (number.edges (1)).") + expect_error( + split.data.activity.based(project.data, activity.type = "issues", number.windows = 0, + sliding.window = test.sliding.window), + info = "Error expected (number.windows) (2)." + ) -}) +}, cases.cross.product( + patrick::cases( + ## The sliding window parameter should be ignored. 
+ "sliding window: FALSE" = list(test.sliding.window = FALSE), + "sliding window: TRUE" = list(test.sliding.window = TRUE) + ), + patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) + ) +)) diff --git a/tests/test-split-data-time-based.R b/tests/test-split-data-time-based.R new file mode 100644 index 00000000..3f28a790 --- /dev/null +++ b/tests/test-split-data-time-based.R @@ -0,0 +1,1538 @@ +## This file is part of coronet, which is free software: you +## can redistribute it and/or modify it under the terms of the GNU General +## Public License as published by the Free Software Foundation, version 2. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +## +## Copyright 2017-2019 by Claus Hunsen +## Copyright 2017 by Felix Prasse +## Copyright 2020 by Thomas Bock +## Copyright 2018 by Christian Hechtl +## Copyright 2018 by Jakob Kronawitter +## Copyright 2019 by Anselm Fehnker +## Copyright 2021 by Niklas Schneider +## Copyright 2021 by Johannes Hostert +## Copyright 2022 by Jonathan Baumann +## All Rights Reserved. + +context("Splitting functionality, time-based splitting of data.") + +## +## Context +## + +CF.DATA = file.path(".", "codeface-data") +CF.SELECTION.PROCESS = "testing" +CASESTUDY = "test" +ARTIFACT = "feature" + +## use only when debugging this file independently +if (!dir.exists(CF.DATA)) CF.DATA = file.path(".", "tests", "codeface-data") + + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Split data -------------------------------------------------------------- + +## * time-based ------------------------------------------------------------ + +## * * time period --------------------------------------------------------- + +## +## Tests for split.data.time.based(..., split.basis = 'commits') +## + +patrick::with_parameters_test_that("Split a data object time-based (split.basis = 'commits').", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits.unfiltered(), + commit.messages = project.data$get.commit.messages(), + issues = project.data$get.issues(), + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() + ) + + ## split data + results = split.data.time.based(project.data, time.period = "3 min", + split.basis = "commits", sliding.window = FALSE) + + ## check time ranges + expected = c( + "2016-07-12 15:58:59-2016-07-12 16:01:59", + "2016-07-12 16:01:59-2016-07-12 16:04:59", + "2016-07-12 16:04:59-2016-07-12 16:06:33" + ) + lapply(results, function(res) { + expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") + }) + + ## This value should not change, 
so we compare it with the default, which is `c("v1-v2", "v2-v3")`. + expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), + info = "Splitting must not modify the original ProjectConf.") + + ## test that the config contains the correct splitting information + expected.config = list( + split.type = "time-based", + split.length = "3 min", + split.basis = "commits", + split.sliding.window = FALSE, + split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:01:59", "2016-07-12 16:04:59", "2016-07-12 16:06:33"), + split.revision.dates = NULL + ) + lapply(results, function(res) { + actual = lapply(names(expected.config), res$get.project.conf()$get.value) + names(actual) = names(expected.config) + expect_equal(expected.config, actual) + }) + + ## check data for all ranges + expected.data = list( + commits = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$commits[1:2, ], + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$commits[0, ], + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commits[3:8, ] + ), + commit.messages = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$commit.messages, + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$commit.messages, + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commit.messages + ), + issues = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$issues[rownames(data$issues) %in% c(14, 20:22, 37:40), ], + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$issues[rownames(data$issues) %in% c(15, 29, 47:49), ], + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(23, 41, 45:46), ] + ), + mails = list( + ## comments indicate row names when pasta is not configured + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$mails[0, ], + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$mails[15, ], # rownames(data$mails) == 16 + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$mails[16, ] # rownames(data$mails) == 17 + ), + pasta = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$pasta, + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$pasta, + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$pasta + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$synchronicity, + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$synchronicity, + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$synchronicity + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) + ) + + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + + expect_equal(results.data, expected.data, info = "Data for ranges.") + +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) + + +## +## Tests for split.data.time.based(..., split.basis = 'mails') +## + +patrick::with_parameters_test_that("Split a data object time-based (split.basis = 'mails').", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) +
proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits.unfiltered(), + commit.messages = project.data$get.commit.messages(), + issues = project.data$get.issues(), + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() + ) + + ## split data + results = split.data.time.based(project.data, time.period = "3 years", + split.basis = "mails", sliding.window = FALSE) + + ## check time ranges + expected = c( + "2004-10-09 18:38:13-2007-10-10 12:38:13", + "2007-10-10 12:38:13-2010-10-10 06:38:13", + "2010-10-10 06:38:13-2013-10-10 00:38:13", + "2013-10-10 00:38:13-2016-07-12 16:05:38" + ) + lapply(results, function(res) { + expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") + }) + + ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. + expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), + info = "Splitting must not modify the original ProjectConf.") + + ## test that the config contains the correct splitting information + expected.config = list( + split.type = "time-based", + split.length = "3 years", + split.basis = "mails", + split.sliding.window = FALSE, + split.revisions = c("2004-10-09 18:38:13", "2007-10-10 12:38:13", "2010-10-10 06:38:13", + "2013-10-10 00:38:13", "2016-07-12 16:05:38"), + split.revision.dates = NULL + ) + lapply(results, function(res) { + actual = lapply(names(expected.config), res$get.project.conf()$get.value) + names(actual) = names(expected.config) + expect_equal(expected.config, actual) + }) + + ## check data for all ranges + expected.data = list( + commits = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$commits[0, ], + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$commits[0, ], + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$commits[0, ], + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$commits[1:2, ] + ), + commit.messages = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$commit.messages, + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$commit.messages, + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$commit.messages, + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$commit.messages + ), + issues = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$issues[0, ], + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$issues[0, ], + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$issues[rownames(data$issues) %in% 1:13, ], + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 27:29, 37:40, 43:49), ] + ), + mails = list( + ## comments indicate row names when pasta is not configured + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$mails[1:2, ], # rownames(data$mails) %in% 1:2 + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$mails[3:12, ], # rownames(data$mails) %in% 3:12 + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$mails[0, ], + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$mails[13:16, ] # rownames(data$mails) %in% 13:17 + ), + pasta = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$pasta, + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$pasta, + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$pasta, + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$pasta + ), + 
synchronicity = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$synchronicity, + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$synchronicity, + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$synchronicity, + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$synchronicity + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) + ) + + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + + expect_equal(results.data, expected.data, info = "Data for ranges.") +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) + + +## +## Tests for split.data.time.based(..., split.basis = 'issues') +## + +patrick::with_parameters_test_that("Split a data object time-based (split.basis = 'issues').", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits.unfiltered(), + commit.messages = project.data$get.commit.messages(), + issues = project.data$get.issues(), + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() + ) + + ## split data + results = split.data.time.based(project.data, time.period = "2 years", + split.basis = "issues", sliding.window = FALSE) + + ## check time ranges + expected = c( + "2013-04-21 23:52:09-2015-04-22 11:52:09", + "2015-04-22 11:52:09-2017-04-21 23:52:09", + "2017-04-21 23:52:09-2017-05-23 12:32:40" + ) + lapply(results, function(res) { + expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") + }) + + ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. 
+ expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), + info = "Splitting must not modify the original ProjectConf.") + + ## test that the config contains the correct splitting information + expected.config = list( + split.type = "time-based", + split.length = "2 years", + split.basis = "issues", + split.sliding.window = FALSE, + split.revisions = c("2013-04-21 23:52:09", "2015-04-22 11:52:09", "2017-04-21 23:52:09", "2017-05-23 12:32:40"), + split.revision.dates = NULL + ) + lapply(results, function(res) { + actual = lapply(names(expected.config), res$get.project.conf()$get.value) + names(actual) = names(expected.config) + expect_equal(expected.config, actual) + }) + + ## check data for all ranges + expected.data = list( + commits = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$commits[0, ], + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$commits, + "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$commits[0, ] + ), + commit.messages = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$commit.messages, + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$commit.messages, + "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$commit.messages + ), + issues = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$issues[rownames(data$issues) %in% 1:13, ], + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% c(14:34, 37:49), ], + "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% 35:36, ] + ), + mails = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$mails[0, ], + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$mails[13:16, ], # when pasta is not configured: rownames(data$mails) %in% 14:17 + "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$mails[0, ] + ), + pasta = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$pasta, + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$pasta, + "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$pasta + ), + synchronicity = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$synchronicity, + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$synchronicity, + "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$synchronicity + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) + ) + + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + + expect_equal(results.data, expected.data, info = "Data for ranges.") + +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) + +## * * time period, sliding windows ---------------------------------------- + +## +## Tests for split.data.time.based(..., split.basis = 'commits'), using sliding windows +## + +patrick::with_parameters_test_that("Split a data object time-based (split.basis = 'commits', sliding.window = TRUE).", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + 
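## Note (cf. the expected ranges below): with sliding.window = TRUE,
+    ## additional windows shifted by half the time period are placed between
+    ## the regular windows, so for the 3-minute period used here, consecutive
+    ## ranges overlap by 90 seconds.
+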
proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits.unfiltered(), + commit.messages = project.data$get.commit.messages(), + issues = project.data$get.issues(), + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() + ) + + ## split data + results = split.data.time.based(project.data, time.period = "3 min", + split.basis = "commits", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2016-07-12 15:58:59-2016-07-12 16:01:59", + "2016-07-12 16:00:29-2016-07-12 16:03:29", + "2016-07-12 16:01:59-2016-07-12 16:04:59", + "2016-07-12 16:03:29-2016-07-12 16:06:29", + "2016-07-12 16:04:59-2016-07-12 16:06:33" + ) + lapply(results, function(res) { + expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") + }) + + ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. + expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), + info = "Splitting must not modify the original ProjectConf.") + + ## test that the config contains the correct splitting information + expected.config = list( + split.type = "time-based", + split.length = "3 min", + split.basis = "commits", + split.sliding.window = TRUE, + split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:00:29", "2016-07-12 16:01:59", + "2016-07-12 16:03:29", "2016-07-12 16:04:59", "2016-07-12 16:06:29", + "2016-07-12 16:06:33"), + split.revision.dates = NULL + ) + lapply(results, function(res) { + actual = lapply(names(expected.config), res$get.project.conf()$get.value) + names(actual) = names(expected.config) + expect_equal(expected.config, actual) + }) + + ## check data for all ranges + expected.data = list( + commits = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$commits[1:2, ], + "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$commits[2, ], + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$commits[0, ], + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$commits[3:5, ], + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commits[3:8, ] + ), + commit.messages = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$commit.messages, + "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$commit.messages, + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$commit.messages, + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$commit.messages, + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commit.messages + ), + issues = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$issues[rownames(data$issues) %in% c(14, 20:22, 37:40), ], + "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$issues[rownames(data$issues) %in% c(14:15, 40, 47:49), ], + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$issues[rownames(data$issues) %in% c(15, 29, 47:49), ], + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$issues[rownames(data$issues) %in% c(29,41,45,46), ], + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(23,41,45,46), ] + ), + mails = list( + ## comments indicate row names when pasta is not configured + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$mails[0, ], + "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$mails[0, ], + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$mails[15, ], # rownames(data$mails) == 16 
+ "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$mails[15:16, ], # rownames(data$mails) %in% c(16,17) + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$mails[16, ] # rownames(data$mails) == 17 + ), + pasta = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$pasta, + "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$pasta, + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$pasta, + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$pasta, + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$pasta + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$synchronicity, + "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$synchronicity, + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$synchronicity, + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$synchronicity, + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$synchronicity + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) + ) + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + + expect_equal(results.data, expected.data, info = "Data for ranges.") + +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) + + +## +## Tests for split.data.time.based(..., split.basis = 'mails'), using sliding windows +## + +patrick::with_parameters_test_that("Split a data object time-based (split.basis = 'mails', sliding.window = TRUE).", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits.unfiltered(), + commit.messages = project.data$get.commit.messages(), + issues = project.data$get.issues(), + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() + ) + + ## split data + results = split.data.time.based(project.data, time.period = "3 years", + split.basis = "mails", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2004-10-09 18:38:13-2007-10-10 12:38:13", + "2006-04-10 15:38:13-2009-04-10 09:38:13", + "2007-10-10 12:38:13-2010-10-10 06:38:13", + "2009-04-10 09:38:13-2012-04-10 03:38:13", + "2010-10-10 06:38:13-2013-10-10 00:38:13", + "2012-04-10 03:38:13-2015-04-10 21:38:13", + "2013-10-10 00:38:13-2016-07-12 16:05:38" + ) + + lapply(results, function(res) { + expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") + }) + + ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. 
+ expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), + info = "Splitting must not modify the original ProjectConf.") + + ## test that the config contains the correct splitting information + expected.config = list( + split.type = "time-based", + split.length = "3 years", + split.basis = "mails", + split.sliding.window = TRUE, + split.revisions = c("2004-10-09 18:38:13", "2006-04-10 15:38:13", "2007-10-10 12:38:13", + "2009-04-10 09:38:13", "2010-10-10 06:38:13", "2012-04-10 03:38:13", + "2013-10-10 00:38:13", "2015-04-10 21:38:13", "2016-07-12 16:05:38"), + split.revision.dates = NULL + ) + lapply(results, function(res) { + actual = lapply(names(expected.config), res$get.project.conf()$get.value) + names(actual) = names(expected.config) + expect_equal(expected.config, actual) + }) + + + ## check data for all ranges + expected.data = list( + commits = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$commits[0, ], + "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$commits[0, ], + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$commits[0, ], + "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$commits[0, ], + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$commits[0, ], + "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$commits[0, ], + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$commits[1:2, ] + ), + commit.messages = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$commit.messages, + "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$commit.messages, + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$commit.messages, + "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$commit.messages, + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$commit.messages, + "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$commit.messages, + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$commit.messages + ), + issues = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$issues[0, ], + "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$issues[0, ], + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$issues[0, ], + "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$issues[0, ], + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$issues[rownames(data$issues) %in% 1:13, ], + "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$issues[rownames(data$issues) %in% 1:13, ], + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 27:29, 37:40, 43:49), ] + ), + mails = list( + ## comments indicate row names when pasta is not configured + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$mails[1:2, ], # rownames(data$mails) %in% 1:2 + "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$mails[0, ], + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$mails[3:12, ], # rownames(data$mails) %in% 3:12 + "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$mails[3:12, ], # rownames(data$mails) %in% 3:12 + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$mails[0, ], + "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$mails[0, ], + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$mails[13:16, ] # rownames(data$mails) %in% 13:17 + ), + pasta = list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$pasta, + "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$pasta, + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$pasta, + "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$pasta, + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$pasta, + "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$pasta, + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$pasta + ), + synchronicity = 
list( + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$synchronicity, + "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$synchronicity, + "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$synchronicity, + "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$synchronicity, + "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$synchronicity, + "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$synchronicity, + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$synchronicity + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) + ) + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + + expect_equal(results.data, expected.data, info = "Data for ranges.") + +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) + + +## +## Tests for split.data.time.based(..., split.basis = 'issues'), using sliding windows +## + +patrick::with_parameters_test_that("Split a data object time-based (split.basis = 'issues', sliding.window = TRUE).", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits.unfiltered(), + commit.messages = project.data$get.commit.messages(), + issues = project.data$get.issues(), + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() + ) + + ## split data + results = split.data.time.based(project.data, time.period = "2 years", + split.basis = "issues", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2013-04-21 23:52:09-2015-04-22 11:52:09", + "2014-04-22 05:52:09-2016-04-21 17:52:09", + "2015-04-22 11:52:09-2017-04-21 23:52:09", + "2016-04-21 17:52:09-2017-05-23 12:32:40" + ) + + lapply(results, function(res) { + expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") + }) + + ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. 
+ expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), + info = "Splitting must not modify the original ProjectConf.") + + ## test that the config contains the correct splitting information + expected.config = list( + split.type = "time-based", + split.length = "2 years", + split.basis = "issues", + split.sliding.window = TRUE, + split.revisions = c("2013-04-21 23:52:09", "2014-04-22 05:52:09", "2015-04-22 11:52:09", + "2016-04-21 17:52:09", "2017-04-21 23:52:09", "2017-05-23 12:32:40"), + split.revision.dates = NULL + ) + lapply(results, function(res) { + actual = lapply(names(expected.config), res$get.project.conf()$get.value) + names(actual) = names(expected.config) + expect_equal(expected.config, actual) + }) + + ## check data for all ranges + expected.data = list( + commits = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$commits[0, ], + "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$commits[0, ], + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$commits, + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$commits + ), + commit.messages = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$commit.messages, + "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$commit.messages, + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$commit.messages, + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$commit.messages + ), + issues = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$issues[rownames(data$issues) %in% 1:13, ], + "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$issues[0, ], + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% c(14:34, 37:49), ], + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(14:36, 37:49), ] + ), + mails = list( + ## comments indicate row names when pasta is not configured + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$mails[0, ], + "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$mails[0, ], + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$mails[13:16, ], # rownames(data$mails) %in% 14:17 + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$mails[13:16, ] # rownames(data$mails) %in% 14:17 + ), + pasta = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$pasta, + "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$pasta, + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$pasta, + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$pasta + ), + synchronicity = list( + "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$synchronicity, + "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$synchronicity, + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$synchronicity, + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$synchronicity + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) + ) + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + + expect_equal(results.data, expected.data, info = "Data for ranges.") + +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = 
list(test.pasta = TRUE, test.synchronicity = TRUE) +)) + +## * * bins ---------------------------------------------------------------- + +## +## Tests for split.data.time.based(..., bins = ...) +## + +patrick::with_parameters_test_that("Split a data object time-based (bins = ... ).", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits.unfiltered(), + commit.messages = project.data$get.commit.messages(), + issues = project.data$get.issues(), + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() + ) + + ## split data + results = split.data.time.based(project.data, bins = c("2016-01-01 00:00:00", "2016-12-31 23:59:59"), + split.basis = "mails", sliding.window = FALSE) + + ## check time ranges + expected = c( + "2016-01-01 00:00:00-2016-12-31 23:59:59" + ) + lapply(results, function(res) { + expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") + }) + + ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. + expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), + info = "Splitting must not modify the original ProjectConf.") + + ## test that the config contains the correct splitting information + expected.config = list( + split.type = "time-based", + split.length = c("2016-01-01 00:00:00", "2016-12-31 23:59:59"), + split.basis = NULL, + split.sliding.window = FALSE, + split.revisions = c("2016-01-01 00:00:00", "2016-12-31 23:59:59"), + split.revision.dates = NULL + ) + lapply(results, function(res) { + actual = lapply(names(expected.config), res$get.project.conf()$get.value) + names(actual) = names(expected.config) + expect_equal(expected.config, actual) + }) + + + ## check data for all ranges + expected.data = list( + commits = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$commits + ), + commit.messages = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$commit.messages + ), + issues = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$issues[rownames(data$issues) %in% c(14:34, 37:49), ] + ), + mails = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$mails[rownames(data$mails) %in% 13:17, ] + ), + pasta = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$pasta + ), + synchronicity = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$synchronicity + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) + ) + + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + + expect_equal(results.data, expected.data, info = "Data for ranges.") +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + 
"pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) + +## +## Tests for split.data.time.based(..., bins = ...), sliding windows parameter ignored +## + +patrick::with_parameters_test_that("Split a data object time-based (bins = ... , sliding.window = TRUE).", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits.unfiltered(), + commit.messages = project.data$get.commit.messages(), + issues = project.data$get.issues(), + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() + ) + + ## split data + results = split.data.time.based(project.data, bins = c("2016-01-01 00:00:00", "2016-12-31 23:59:59", + "2017-06-03 03:03:03"), + split.basis = "mails", sliding.window = TRUE) + + ## check time ranges + expected = c( + "2016-01-01 00:00:00-2016-12-31 23:59:59", + "2016-12-31 23:59:59-2017-06-03 03:03:03" + ) + lapply(results, function(res) { + expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") + }) + + ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. + expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), + info = "Splitting must not modify the original ProjectConf.") + + ## test that the config contains the correct splitting information + expected.config = list( + split.type = "time-based", + split.length = c("2016-01-01 00:00:00", "2016-12-31 23:59:59", "2017-06-03 03:03:03"), + split.basis = NULL, + split.sliding.window = FALSE, + split.revisions = c("2016-01-01 00:00:00", "2016-12-31 23:59:59", "2017-06-03 03:03:03"), + split.revision.dates = NULL + ) + lapply(results, function(res) { + actual = lapply(names(expected.config), res$get.project.conf()$get.value) + names(actual) = names(expected.config) + expect_equal(expected.config, actual) + }) + + ## check data for all ranges + expected.data = list( + commits = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$commits, + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$commits[0, ] + ), + commit.messages = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$commit.messages, + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$commit.messages + ), + issues = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$issues[rownames(data$issues) %in% c(14:34, 37:49), ], + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$issues[rownames(data$issues) %in% 35:36, ] + ), + mails = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$mails[rownames(data$mails) %in% 13:17, ], + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$mails[0, ] + ), + pasta = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$pasta, + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$pasta + ), + synchronicity = list( + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$synchronicity, + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$synchronicity + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + mails = 
lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) + ) + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + + expect_equal(results.data, expected.data, info = "Data for ranges.") +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) + +## * * custom event timestamps ---------------------------------------------------------------- + +## +## Tests for split.data.time.based.by.timestamps +## + +patrick::with_parameters_test_that("Split a data object time-based using custom event timestamps.", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) + proj.conf$update.value("custom.event.timestamps.file", "custom-events.list") + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits.unfiltered(), + commit.messages = project.data$get.commit.messages(), + issues = project.data$get.issues(), + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() + ) + + ## split data by custom event timestamps + results = split.data.time.based.by.timestamps(project.data) + + ## check time ranges + expected = c( + "2016-07-12 15:00:00-2016-07-12 16:00:00", + "2016-07-12 16:00:00-2016-07-12 16:05:00", + "2016-07-12 16:05:00-2016-08-08 00:00:00", + "2016-08-08 00:00:00-2016-10-05 09:00:00" + ) + lapply(results, function(res) { + expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") + }) + + ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. 
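+    ## (The custom timestamps affect only the ranges checked above: each range
+    ## spans two consecutive timestamps read from the "custom-events.list"
+    ## file that is configured at the beginning of this test.)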
+ expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), + info = "Splitting must not modify the original ProjectConf.") + + ## test that the config contains the correct splitting information + expected.config = list( + split.type = "time-based", + split.length = c("2016-07-12 15:00:00", "2016-07-12 16:00:00", "2016-07-12 16:05:00", + "2016-08-08 00:00:00", "2016-10-05 09:00:00"), + split.basis = NULL, + split.sliding.window = FALSE, + split.revisions = c("2016-07-12 15:00:00", "2016-07-12 16:00:00", "2016-07-12 16:05:00", + "2016-08-08 00:00:00", "2016-10-05 09:00:00"), + split.revision.dates = NULL + ) + lapply(results, function(res) { + actual = lapply(names(expected.config), res$get.project.conf()$get.value) + names(actual) = names(expected.config) + expect_equal(expected.config, actual) + }) + + ## check data for all ranges + expected.data = list( + commits = list( + "2016-07-12 15:00:00-2016-07-12 16:00:00" = data$commits[1, ], + "2016-07-12 16:00:00-2016-07-12 16:05:00" = data$commits[2, ], + "2016-07-12 16:05:00-2016-08-08 00:00:00" = data$commits[3:8, ], + "2016-08-08 00:00:00-2016-10-05 09:00:00" = data$commits[0, ] + ), + commit.messages = list( + "2016-07-12 15:00:00-2016-07-12 16:00:00" = data$commit.messages, + "2016-07-12 16:00:00-2016-07-12 16:05:00" = data$commit.messages, + "2016-07-12 16:05:00-2016-08-08 00:00:00" = data$commit.messages, + "2016-08-08 00:00:00-2016-10-05 09:00:00" = data$commit.messages + ), + issues = list( + "2016-07-12 15:00:00-2016-07-12 16:00:00" = data$issues[rownames(data$issues) %in% c(20:22, 27, 28, 37:39), ], + "2016-07-12 16:00:00-2016-07-12 16:05:00" = data$issues[rownames(data$issues) %in% c(14, 15, 29, 40, 45:49), ], + "2016-07-12 16:05:00-2016-08-08 00:00:00" = data$issues[rownames(data$issues) %in% c(16:19, 23:24, 41, 42), ], + "2016-08-08 00:00:00-2016-10-05 09:00:00" = data$issues[rownames(data$issues) %in% c(25, 30), ] + ), + mails = list( + ## comments indicate rownames when pasta is not configured + "2016-07-12 15:00:00-2016-07-12 16:00:00" = data$mails[13:14, ], # rownames(data$mails) %in% 14:15 + "2016-07-12 16:00:00-2016-07-12 16:05:00" = data$mails[15, ], # rownames(data$mails) %in% 16 + "2016-07-12 16:05:00-2016-08-08 00:00:00" = data$mails[16, ], # rownames(data$mails) %in% 17 + "2016-08-08 00:00:00-2016-10-05 09:00:00" = data$mails[0, ] + ), + pasta = list( + "2016-07-12 15:00:00-2016-07-12 16:00:00" = data$pasta, + "2016-07-12 16:00:00-2016-07-12 16:05:00" = data$pasta, + "2016-07-12 16:05:00-2016-08-08 00:00:00" = data$pasta, + "2016-08-08 00:00:00-2016-10-05 09:00:00" = data$pasta + ), + synchronicity = list( + "2016-07-12 15:00:00-2016-07-12 16:00:00" = data$synchronicity, + "2016-07-12 16:00:00-2016-07-12 16:05:00" = data$synchronicity, + "2016-07-12 16:05:00-2016-08-08 00:00:00" = data$synchronicity, + "2016-08-08 00:00:00-2016-10-05 09:00:00" = data$synchronicity + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) + ) + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + expect_equal(results.data, 
expected.data, info = "Data for ranges.") +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) + +## * * ranges -------------------------------------------------------------- + +## +## Test splitting data by network names. +## + +patrick::with_parameters_test_that("Test splitting data by networks", { + ## configuration and data objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("commits.filter.base.artifact", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) + net.conf = NetworkConf$new() + net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) + + ## construct project data + project.data = ProjectData$new(proj.conf) + + ## split data + mybins = get.date.from.string(c("2016-07-12 15:00:00", "2016-07-12 16:00:00", + "2016-07-12 16:05:00", "2016-10-05 09:00:00")) + input.data = split.data.time.based(project.data, bins = mybins) + input.data.network = lapply(input.data, function(d) NetworkBuilder$new(d, net.conf)$get.author.network()) + + ## split data by networks + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", "project.all.ranges", + "complete") + results = lapply(aggregation.level, function(level) + split.data.by.networks(input.data.network, project.data, level) + ) + names(results) = aggregation.level + + ## construct expected ranges + expected.ranges = list( + range = c("2016-07-12 15:00:00-2016-07-12 16:00:00", + "2016-07-12 16:00:00-2016-07-12 16:05:00", + "2016-07-12 16:05:00-2016-10-05 09:00:00"), + cumulative = c("2016-07-12 15:00:00-2016-07-12 16:00:00", + "2016-07-12 15:00:00-2016-07-12 16:05:00", + "2016-07-12 15:00:00-2016-10-05 09:00:00"), + all.ranges = c("2016-07-12 15:00:00-2016-10-05 09:00:00", + "2016-07-12 15:00:00-2016-10-05 09:00:00", + "2016-07-12 15:00:00-2016-10-05 09:00:00"), + project.cumulative = c("2004-10-09 18:38:13-2016-07-12 16:00:00", + "2004-10-09 18:38:13-2016-07-12 16:05:00", + "2004-10-09 18:38:13-2016-10-05 09:00:00"), + project.all.ranges = c("2004-10-09 18:38:13-2016-10-05 09:00:00", + "2004-10-09 18:38:13-2016-10-05 09:00:00", + "2004-10-09 18:38:13-2016-10-05 09:00:00"), + complete = c("2004-10-09 18:38:13-2017-05-23 12:32:40", + "2004-10-09 18:38:13-2017-05-23 12:32:40", + "2004-10-09 18:38:13-2017-05-23 12:32:40") + ) + + ## test the ranges + test.each.network = function(aggregation.level) { + result.data = results[[aggregation.level]] + expected.range.names = expected.ranges[[aggregation.level]] + + lapply(seq_along(result.data), function(i) { + result.entry = result.data[[i]] + + expect_true(igraph::identical_graphs(result.entry[["network"]], input.data.network[[i]])) + expect_equal(result.entry[["data"]]$get.range(), expected.range.names[[i]]) + }) + } + lapply(aggregation.level, test.each.network) +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) + +## +## Test splitting data by ranges. 
+## + +patrick::with_parameters_test_that("Test splitting data by ranges", { + ## configuration and data objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("commits.filter.base.artifact", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) + net.conf = NetworkConf$new() + net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) + + ## construct project data + project.data = ProjectData$new(proj.conf) + + ## split data + my.bins = get.date.from.string(c("2016-07-12 15:00:00", "2016-07-12 16:00:00", + "2016-07-12 16:05:00", "2016-10-05 09:00:00")) + my.ranges = construct.ranges(my.bins, sliding.window = FALSE) + expected.results = split.data.time.based(project.data, bins = my.bins) + results = split.data.time.based.by.ranges(project.data, my.ranges) + + ## check time ranges + expect_equal(names(results), my.ranges, info = "Time ranges.") + + ## check data for all ranges + expected.data = list( + commits = lapply(expected.results, function(cf.data) cf.data$get.commits.unfiltered()), + commit.messages = lapply(expected.results, function(cf.data) cf.data$get.commit.messages()), + issues = lapply(expected.results, function(cf.data) cf.data$get.issues()), + mails = lapply(expected.results, function(cf.data) cf.data$get.mails()), + pasta = lapply(expected.results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(expected.results, function(cf.data) cf.data$get.synchronicity()) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) + ) + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + + expect_equal(results.data, expected.data, info = "Data for ranges.") +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) + +## * * window numbers --------------------------------------------------------- + +## +## Tests for split.data.time.based(..., number.windows = ..., split.basis = 'commits') +## + +patrick::with_parameters_test_that("Split a data object time-based with equal-sized windows (number.windows = ..., split.basis = 'commits').", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits.unfiltered(), + commit.messages = project.data$get.commit.messages(), + issues = project.data$get.issues(), + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() + ) + + ## split data + results = split.data.time.based(project.data, number.windows = 3, + split.basis = "commits", sliding.window = FALSE) + + ## check time ranges + 
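## Note: number.windows = 3 splits the overall commit period (2016-07-12
+    ## 15:58:59 to 2016-07-12 16:06:33, i.e., 454 seconds) into three windows
+    ## of equal length: 151 seconds each, i.e., the "2M 31S" expected as
+    ## split.length below; the last window absorbs the rounding remainder.
+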
expected = c( + "2016-07-12 15:58:59-2016-07-12 16:01:30", + "2016-07-12 16:01:30-2016-07-12 16:04:01", + "2016-07-12 16:04:01-2016-07-12 16:06:33" + ) + lapply(results, function(res) { + expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") + }) + + ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. + expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), + info = "Splitting must not modify the original ProjectConf.") + + ## test that the config contains the correct splitting information + + expected.config = list( + split.type = "time-based", + split.length = "2M 31S", + split.basis = "commits", + split.sliding.window = FALSE, + split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:01:30", "2016-07-12 16:04:01", "2016-07-12 16:06:33"), + split.revision.dates = NULL + ) + lapply(results, function(res) { + actual = lapply(names(expected.config), res$get.project.conf()$get.value) + names(actual) = names(expected.config) + expect_equal(expected.config, actual) + }) + + ## check data for all ranges + expected.data = list( + commits = list( + "2016-07-12 15:58:59-2016-07-12 16:01:30" = data$commits[1:2, ], + "2016-07-12 16:01:30-2016-07-12 16:04:01" = data$commits[0, ], + "2016-07-12 16:04:01-2016-07-12 16:06:33" = data$commits[3:8, ] + ), + commit.messages = list( + "2016-07-12 15:58:59-2016-07-12 16:01:30" = data$commit.messages, + "2016-07-12 16:01:30-2016-07-12 16:04:01" = data$commit.messages, + "2016-07-12 16:04:01-2016-07-12 16:06:33" = data$commit.messages + ), + issues = list( + "2016-07-12 15:58:59-2016-07-12 16:01:30" = data$issues[rownames(data$issues) %in% c(20:22, 37:40), ], + "2016-07-12 16:01:30-2016-07-12 16:04:01" = data$issues[rownames(data$issues) %in% c(14, 15, 29, 47:49), ], + "2016-07-12 16:04:01-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(23, 41, 45:46), ] + ), + mails = list( + "2016-07-12 15:58:59-2016-07-12 16:01:30" = data$mails[0, ], + "2016-07-12 16:01:30-2016-07-12 16:04:01" = data$mails[0, ], + "2016-07-12 16:04:01-2016-07-12 16:06:33" = data$mails[15:16, ] # when pasta is not configured: rownames(data$mails) %in% 16:17 + ), + pasta = list( + "2016-07-12 15:58:59-2016-07-12 16:01:30" = data$pasta, + "2016-07-12 16:01:30-2016-07-12 16:04:01" = data$pasta, + "2016-07-12 16:04:01-2016-07-12 16:06:33" = data$pasta + ), + synchronicity = list( + "2016-07-12 15:58:59-2016-07-12 16:01:30" = data$synchronicity, + "2016-07-12 16:01:30-2016-07-12 16:04:01" = data$synchronicity, + "2016-07-12 16:04:01-2016-07-12 16:06:33" = data$synchronicity + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) + ) + + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + + expect_equal(results.data, expected.data, info = "Data for ranges.") + +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) + + +## +## Tests for 
split.data.time.based(..., number.windows = ..., split.basis = 'mails') +## + +patrick::with_parameters_test_that("Split a data object time-based with equal-sized windows (number.windows = ..., split.basis = 'mails').", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits.unfiltered(), + commit.messages = project.data$get.commit.messages(), + issues = project.data$get.issues(), + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() + ) + + ## split data + results = split.data.time.based(project.data, number.windows = 4, + split.basis = "mails", sliding.window = FALSE) + + ## check time ranges + expected = c( + "2004-10-09 18:38:13-2007-09-18 06:00:04", + "2007-09-18 06:00:04-2010-08-26 17:21:55", + "2010-08-26 17:21:55-2013-08-04 04:43:46", + "2013-08-04 04:43:46-2016-07-12 16:05:38" + ) + lapply(results, function(res) { + expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") + }) + + ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. + expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), + info = "Splitting must not modify the original ProjectConf.") + + ## test that the config contains the correct splitting information + expected.config = list( + split.type = "time-based", + split.length = "2y 0m 342d 23H 21M 51S", + split.basis = "mails", + split.sliding.window = FALSE, + split.revisions = c("2004-10-09 18:38:13", "2007-09-18 06:00:04", "2010-08-26 17:21:55", + "2013-08-04 04:43:46", "2016-07-12 16:05:38"), + split.revision.dates = NULL + ) + lapply(results, function(res) { + actual = lapply(names(expected.config), res$get.project.conf()$get.value) + names(actual) = names(expected.config) + expect_equal(expected.config, actual) + }) + + ## check data for all ranges + expected.data = list( + commits = list( + "2004-10-09 18:38:13-2007-09-18 06:00:04" = data$commits[0, ], + "2007-09-18 06:00:04-2010-08-26 17:21:55" = data$commits[0, ], + "2010-08-26 17:21:55-2013-08-04 04:43:46" = data$commits[0, ], + "2013-08-04 04:43:46-2016-07-12 16:05:38" = data$commits[1:2, ] + ), + commit.messages = list( + "2004-10-09 18:38:13-2007-09-18 06:00:04" = data$commit.messages, + "2007-09-18 06:00:04-2010-08-26 17:21:55" = data$commit.messages, + "2010-08-26 17:21:55-2013-08-04 04:43:46" = data$commit.messages, + "2013-08-04 04:43:46-2016-07-12 16:05:38" = data$commit.messages + ), + issues = list( + "2004-10-09 18:38:13-2007-09-18 06:00:04" = data$issues[0, ], + "2007-09-18 06:00:04-2010-08-26 17:21:55" = data$issues[0, ], + "2010-08-26 17:21:55-2013-08-04 04:43:46" = data$issues[rownames(data$issues) %in% 1:13, ], + "2013-08-04 04:43:46-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 27:29, 37:40, 43:49), ] + ), + mails = list( + ## comments indicate row names when pasta is not configured + "2004-10-09 18:38:13-2007-09-18 06:00:04" = data$mails[1:2, ], + "2007-09-18 06:00:04-2010-08-26 17:21:55" = data$mails[3:12, ], # rownames(data$mails) %in% 3:12 + "2010-08-26 17:21:55-2013-08-04 04:43:46" = data$mails[0, ], + "2013-08-04 04:43:46-2016-07-12 16:05:38" = 
data$mails[13:16, ] # rownames(data$mails) %in% 13:17 + ), + pasta = list( + "2004-10-09 18:38:13-2007-09-18 06:00:04" = data$pasta, + "2007-09-18 06:00:04-2010-08-26 17:21:55" = data$pasta, + "2010-08-26 17:21:55-2013-08-04 04:43:46" = data$pasta, + "2013-08-04 04:43:46-2016-07-12 16:05:38" = data$pasta + ), + synchronicity = list( + "2004-10-09 18:38:13-2007-09-18 06:00:04" = data$synchronicity, + "2007-09-18 06:00:04-2010-08-26 17:21:55" = data$synchronicity, + "2010-08-26 17:21:55-2013-08-04 04:43:46" = data$synchronicity, + "2013-08-04 04:43:46-2016-07-12 16:05:38" = data$synchronicity + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) + ) + + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + + expect_equal(results.data, expected.data, info = "Data for ranges.") +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) + + +## +## Tests for split.data.time.based(..., number.windows = ..., split.basis = 'issues') +## + +patrick::with_parameters_test_that("Split a data object time-based with equal-sized windows (number.windows = ..., split.basis = 'issues').", { + + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits.unfiltered(), + commit.messages = project.data$get.commit.messages(), + issues = project.data$get.issues(), + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() + ) + + ## split data + results = split.data.time.based(project.data, number.windows = 3, + split.basis = "issues", sliding.window = FALSE) + + ## check time ranges + expected = c( + "2013-04-21 23:52:09-2014-09-01 12:05:39", + "2014-09-01 12:05:39-2016-01-12 00:19:09", + "2016-01-12 00:19:09-2017-05-23 12:32:40" + ) + lapply(results, function(res) { + expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") + }) + + ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. 
+ expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), + info = "Splitting must not modify the original ProjectConf.") + + ## test that the config contains the correct splitting information + expected.config = list( + split.type = "time-based", + split.length = "1y 0m 132d 6H 13M 30S", + split.basis = "issues", + split.sliding.window = FALSE, + split.revisions = c("2013-04-21 23:52:09", "2014-09-01 12:05:39", "2016-01-12 00:19:09", "2017-05-23 12:32:40"), + split.revision.dates = NULL + ) + lapply(results, function(res) { + actual = lapply(names(expected.config), res$get.project.conf()$get.value) + names(actual) = names(expected.config) + expect_equal(expected.config, actual) + }) + + ## check data for all ranges + expected.data = list( + commits = list( + "2013-04-21 23:52:09-2014-09-01 12:05:39" = data$commits[0, ], + "2014-09-01 12:05:39-2016-01-12 00:19:09" = data$commits[0, ], + "2016-01-12 00:19:09-2017-05-23 12:32:40" = data$commits + ), + commit.messages = list( + "2013-04-21 23:52:09-2014-09-01 12:05:39" = data$commit.messages, + "2014-09-01 12:05:39-2016-01-12 00:19:09" = data$commit.messages, + "2016-01-12 00:19:09-2017-05-23 12:32:40" = data$commit.messages + ), + issues = list( + "2013-04-21 23:52:09-2014-09-01 12:05:39" = data$issues[rownames(data$issues) %in% 1:13, ], + "2014-09-01 12:05:39-2016-01-12 00:19:09" = data$issues[0, ], + "2016-01-12 00:19:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% 14:49, ] + ), + mails = list( + "2013-04-21 23:52:09-2014-09-01 12:05:39" = data$mails[0, ], + "2014-09-01 12:05:39-2016-01-12 00:19:09" = data$mails[0, ], + "2016-01-12 00:19:09-2017-05-23 12:32:40" = data$mails[13:16, ] # when pasta is not configured: rownames(data$mails) %in% 13:17 + ), + pasta = list( + "2013-04-21 23:52:09-2014-09-01 12:05:39" = data$pasta, + "2014-09-01 12:05:39-2016-01-12 00:19:09" = data$pasta, + "2016-01-12 00:19:09-2017-05-23 12:32:40" = data$pasta + ), + synchronicity = list( + "2013-04-21 23:52:09-2014-09-01 12:05:39" = data$synchronicity, + "2014-09-01 12:05:39-2016-01-12 00:19:09" = data$synchronicity, + "2016-01-12 00:19:09-2017-05-23 12:32:40" = data$synchronicity + ) + ) + results.data = list( + commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), + commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), + issues = lapply(results, function(cf.data) cf.data$get.issues()), + mails = lapply(results, function(cf.data) cf.data$get.mails()), + pasta = lapply(results, function(cf.data) cf.data$get.pasta()), + synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) + ) + + expected.data = remove.row.names.from.inner.list.of.dfs(expected.data) + results.data = remove.row.names.from.inner.list.of.dfs(results.data) + + expect_equal(results.data, expected.data, info = "Data for ranges.") + +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) diff --git a/tests/test-split-misc.R b/tests/test-split-misc.R new file mode 100644 index 00000000..c2a9c723 --- /dev/null +++ b/tests/test-split-misc.R @@ -0,0 +1,412 @@ +## This file is part of coronet, which is free software: you +## can redistribute it and/or modify it under the terms of the GNU General +## Public License as published by the Free Software Foundation, version 2. 
+## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +## +## Copyright 2017-2019 by Claus Hunsen +## Copyright 2018 by Jakob Kronawitter +## Copyright 2022 by Jonathan Baumann +## All Rights Reserved. + + +context("Splitting functionality.") + +## +## Context +## + +CF.DATA = file.path(".", "codeface-data") +CF.SELECTION.PROCESS = "testing" +CASESTUDY = "test" +ARTIFACT = "feature" + +## use only when debugging this file independently +if (!dir.exists(CF.DATA)) CF.DATA = file.path(".", "tests", "codeface-data") + + +## +## TODO +## + +## - net.conf$update.values(list(pasta = TRUE, synchronicity = TRUE)) + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Split raw data (data and networks by bins) ------------------------------ + +## +## Tests for split.data.by.bins and split.network.by.bins +## + +test_that("Split network and data on low level (split.data.by.bins, split.network.by.bins).", { + + length.dates = 15 + length.bins = 5 + + ## generate dates + dates = c("2000-01-25", "2000-01-23", "2000-01-15", "2000-01-27", "2000-01-13", + "2000-01-03", "2000-01-05", "2000-01-29", "2000-01-19", "2000-01-01", + "2000-01-11", "2000-01-07", "2000-01-21", "2000-01-09", "2000-01-17") + # ## ## generated with: + # sprintf("c(\"%s\")", paste( + # get.date.string(sample( + # seq.POSIXt(get.date.from.string("2000-01-01"), get.date.from.string("2000-02-01"), by = "1 days"), + # length.dates, + # replace = FALSE + # )), collapse = "\", \"")) + + ## generate bins + bins = seq_len(length.bins) + bins.vector = c("1", "3", "5", "4", "1", "3", "1", "3", "2", "5", "4", "2", "4", "3", "5") + ## ## generated with: + ## sprintf("c(\"%s\")", paste( sample(bins, size = length.dates, replace = TRUE), collapse = "', '") ) + + ## + ## split.data.by.bins + ## + + ## generate data frame with dates and IDs + df = data.frame( + id = 1:length.dates, + date = dates + ) + + ## results + expected = list( + "1" = df[ c(1, 5, 7), ], + "2" = df[ c(9, 12), ], + "3" = df[ c(2, 6, 8, 14), ], + "4" = df[ c(4, 11, 13), ], + "5" = df[ c(3, 10, 15), ] + ) + results = split.data.by.bins(df, bins.vector) + + ## check result + expect_equal(results, expected, info = "Split data by bins.") + + ## + ## split.network.by.bins + ## + + ## generate data frame with dates and IDs + vcount = 4 + net = igraph::make_empty_graph(n = vcount, directed = FALSE) + for (e.id in seq_len(length.dates)) { + net = net + igraph::edge( + sample(seq_len(vcount), 1), # from vertex + sample(seq_len(vcount), 1), # to vertex + date = get.date.from.string(dates[e.id]) + ) + } + + ## results + expected = list( + igraph::subgraph.edges(net, c(1, 5, 7)), + igraph::subgraph.edges(net, c(9, 12)), + igraph::subgraph.edges(net, c(2, 6, 8, 14)), + igraph::subgraph.edges(net, c(4, 11, 13)), + igraph::subgraph.edges(net, c(3, 10, 15)) + ) + results = split.network.by.bins(net, bins, bins.vector) + + ## check networks + check.identical = mapply(results, expected, FUN = function(r, e) { + igraph::identical_graphs(r, e) + }) + expect_true(all(check.identical), info = "Split network by bins (network equality).") + +}) + + +## / / / / / / / 
/ / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Bin identification ------------------------------------------------------ + +## +## Tests for split.get.bins.time.based and split.get.bins.activity.based +## + +test_that("Get bins for network and data on low level (split.get.bins.time.based, split.get.bins.activity.based).", { + + length.dates = 15 + length.bins = 5 + + ## generate dates + dates = c("2000-01-25", "2000-01-23", "2000-01-15", "2000-01-27", "2000-01-13", + "2000-01-03", "2000-01-05", "2000-01-29", "2000-01-19", "2000-01-01", + "2000-01-11", "2000-01-07", "2000-01-21", "2000-01-09", "2000-01-17") + dates.posixct = get.date.from.string(dates) + ## ## generated with: + ## sprintf("c(\"%s\")", paste( + ## get.date.string(sample( + ## seq.POSIXt(get.date.from.string("2000-01-01"), get.date.from.string("2000-02-01"), by = "1 days"), + ## length.dates, + ## replace = FALSE + ## )), collapse = "\", \"")) + + ## + ## split.get.bins.time.based (1) + ## + + ## results + expected.bins = c("2000-01-01 00:00:00", "2000-01-11 00:00:00", "2000-01-21 00:00:00", "2000-01-29 00:00:01") + expected = list( + vector = factor(head(expected.bins, -1))[c(3, 3, 2, 3, 2, + 1, 1, 3, 2, 1, + 2, 1, 3, 1, 2)], + bins = expected.bins + ) + results = split.get.bins.time.based(dates.posixct, "10 days") + + ## check result + expect_equal(results, expected, info = "split.get.bins.time.based (1)") + + ## + ## split.get.bins.time.based (2) + ## + + ## results + expected.bins = c("2000-01-01 00:00:00", "2000-01-29 00:00:01") + expected = list( + vector = factor(head(expected.bins, -1))[ rep(1, length.dates) ], + bins = expected.bins + ) + results = split.get.bins.time.based(dates.posixct, "1 year") + + ## check result + expect_equal(results, expected, info = "split.get.bins.time.based (2)") + + ## + ## split.get.bins.time.based (3) + ## + + ## results + dates.unround = get.date.from.string(c("2004-01-01 00:00:00", "2004-01-01 00:00:14", "2004-01-01 00:00:22")) + expected.bins = c("2004-01-01 00:00:00", "2004-01-01 00:00:05", "2004-01-01 00:00:10", + "2004-01-01 00:00:15", "2004-01-01 00:00:20", "2004-01-01 00:00:23") # adding 4.2 seconds each + expected = list( + vector = factor(head(expected.bins, -1))[ c(1, 3, 5) ], + bins = expected.bins + ) + results = split.get.bins.time.based(dates.unround, number.windows = length.bins) + + ## check result + expect_equal(results, expected, info = "split.get.bins.time.based (3)") + + ## + ## split.get.bins.activity.based (1) + ## + + ## construct data.frame + df = data.frame(date = dates.posixct, id = seq_len(length.dates)) + df = df[ order(df$date), ] + + ## results + expected = list( + vector = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4), + bins = c("2000-01-01 00:00:00", "2000-01-09 00:00:00", "2000-01-17 00:00:00", "2000-01-25 00:00:00", "2000-01-29 00:00:01") + ) + results = split.get.bins.activity.based(df, "id", 4) + + ## check result + expect_equal(results, expected, info = "split.get.bins.activity.based (1)") + + ## + ## split.get.bins.activity.based (2) + ## + + ## construct data.frame + df = data.frame(date = dates.posixct, id = seq_len(length.dates)) + df = df[ order(df$date), ] + + ## results + expected = list( + vector = rep(1, length.out = length.dates), + bins = c("2000-01-01 00:00:00", "2000-01-29 00:00:01") + ) + results = split.get.bins.activity.based(df, "id", nrow(df) + 10) + + ## check result + expect_equal(results, expected, info = "split.get.bins.activity.based (2)") + +}) + + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / 
/ / / / / / / / / +## Consistency tests ------------------------------------------------------- + +## +## Tests for consistency of data and network time-based splitting +## + +test_that("Check consistency of data and network time-based splitting.", { + + ## configuration and data objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("commits.filter.base.artifact", FALSE) + net.conf = NetworkConf$new() + net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) + + ## retrieve project data and network builder + project.data = ProjectData$new(proj.conf) + net.builder = NetworkBuilder$new(project.data, net.conf) + ## retrieve author network + project.net = net.builder$get.author.network() + + ## set time period for splitting + time.period = "7 mins" + + ## split data + results.data = split.data.time.based(project.data, time.period = time.period, split.basis = "commits") + results.data.network = lapply(results.data, function(d) NetworkBuilder$new(d, net.conf)$get.author.network()) + + ## split network + results.network = split.network.time.based(project.net, time.period = time.period) + + ## check ranges + expect_equal(names(results.network), names(results.data.network), info = "Range equality.") + + ## the chosen time-window size results in the following condition: + ## 1) Thomas and Karl only appear in the second time window, both working on the base feature. + ## 2) Olaf only appears in the first time window, working on the base feature as the only author. + ## Thus, when splitting the project-level network, there are edges from Olaf to Karl and Thomas, + ## crossing the time-window border. Hence, when deleting the respective vertices from the networks, + ## the data-based networks should match the network-based networks. 
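+ ## Note: igraph::delete.vertices also removes all edges incident to the deleted
+ ## vertices, which is what drops the border-crossing edges here. A minimal
+ ## sketch of this behavior on a hypothetical toy graph:
+ ##     g = igraph::make_graph(~ Olaf-Karl, Olaf-Thomas, Karl-Thomas)
+ ##     igraph::ecount(igraph::delete.vertices(g, "Olaf")) # 1: only Karl-Thomas remains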
+ results.network[[1]] = igraph::delete.vertices(results.network[[1]], c("Thomas", "Karl")) + results.network[[2]] = igraph::delete.vertices(results.network[[2]], c("Olaf")) + check.identical = mapply(results.data.network, results.network, FUN = function(d, n) { + igraph::identical_graphs(d, n) + }) + expect_true(all(check.identical), info = "Network equality.") + +}) + + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Unification of range names ---------------------------------------------- + +## +## Tests for duplicate range names +## + +test_that("Check and correct duplicate range names during network activity-based splitting.", { + + ## define dates for edges and the resulting changes + dates = get.date.from.string(c( + "2000-01-01 01:00:00", "2001-01-01 12:00:00", + + "2001-01-01 12:00:00", "2001-01-01 12:00:00", + "2001-01-01 12:00:00", "2001-01-01 12:00:00", + "2001-01-01 12:00:00", "2001-01-01 12:00:00", + + "2002-01-01 12:00:00", "2002-01-01 12:00:00", + "2002-01-01 12:00:00", "2002-01-01 12:00:00", + "2002-01-01 12:00:00", "2002-01-01 12:00:00", + "2002-01-01 12:00:00", "2002-01-01 12:00:00", + + "2002-01-01 12:00:00", "2003-01-01 12:00:00" + )) + expected.ranges = c( + "2000-01-01 01:00:00-2001-01-01 12:00:00", + + "2001-01-01 12:00:00-2001-01-01 12:00:00", + "2001-01-01 12:00:00-2001-01-01 12:00:00", + + "2001-01-01 12:00:00-2002-01-01 12:00:00", + + "2002-01-01 12:00:00-2002-01-01 12:00:00", + "2002-01-01 12:00:00-2002-01-01 12:00:00", + "2002-01-01 12:00:00-2002-01-01 12:00:00", + "2002-01-01 12:00:00-2002-01-01 12:00:00", + + "2002-01-01 12:00:00-2003-01-01 12:00:01" + ) + expected.ranges.corrected = c( + "2000-01-01 01:00:00-2001-01-01 12:00:00", + + "2001-01-01 12:00:00-2001-01-01 12:00:00 (1)", + "2001-01-01 12:00:00-2001-01-01 12:00:00 (2)", + + "2001-01-01 12:00:00-2002-01-01 12:00:00", + + "2002-01-01 12:00:00-2002-01-01 12:00:00 (1)", + "2002-01-01 12:00:00-2002-01-01 12:00:00 (2)", + "2002-01-01 12:00:00-2002-01-01 12:00:00 (3)", + "2002-01-01 12:00:00-2002-01-01 12:00:00 (4)", + + "2002-01-01 12:00:00-2003-01-01 12:00:01" + ) + + ## construct a small network + net = igraph::make_empty_graph(directed = FALSE) + + igraph::vertices(c("A", "B")) + + igraph::edges(rep(c("A", "B"), times = length(dates))) + ## set some date attributes that are appropriate for the test case + net = igraph::set.edge.attribute(net, "date", value = dates) + + ## define split arguments + split.function = split.network.activity.based + split.activity.amount = 2 + split.arguments = list(network = net, number.edges = split.activity.amount, sliding.window = FALSE) + + ## check for issued warning + expect_output( + do.call(split.function, split.arguments), + "WARNING::Due to the splitting, there are duplicated range names.", + fixed = TRUE, + info = "Generate warning." 
+ )
+
+ ## check range names
+ net.split = do.call(split.function, split.arguments)
+ ranges = names(net.split)
+ expect_equal(ranges, expected.ranges, info = "Ranges (original).")
+
+ ## correct ranges
+ ranges.corrected = split.unify.range.names(ranges)
+ expect_equal(ranges.corrected, expected.ranges.corrected, info = "Ranges (unified).")
+
+
+ ## Arbitrary range names (1)
+ ranges = c("A-B", "B-C", "C-D")
+ expected = c("A-B", "B-C", "C-D")
+ result = split.unify.range.names(ranges)
+ expect_identical(result, expected, info = "Arbitrary ranges (1).")
+
+ ## Arbitrary range names (2)
+ ranges = c("A-B", "A-B", "B-C", "B-C", "C-D")
+ expected = c("A-B (1)", "A-B (2)", "B-C (1)", "B-C (2)", "C-D")
+ result = split.unify.range.names(ranges)
+ expect_identical(result, expected, info = "Arbitrary ranges (2).")
+
+ ## Arbitrary range names (3)
+ ranges = c("A-B", "A-B", "B-C", "A-B", "B-C")
+ expected = c("A-B (1)", "A-B (2)", "B-C (1)", "A-B (1)", "B-C (1)")
+ result = split.unify.range.names(ranges)
+ expect_identical(result, expected, info = "Arbitrary ranges (3).")
+
+ ## Arbitrary range names (4)
+ ranges = c("A-B", "A-B", "B-C", "C-D", "C-D")
+ expected = c("A-B (1)", "A-B (2)", "B-C", "C-D (1)", "C-D (2)")
+ result = split.unify.range.names(ranges)
+ expect_identical(result, expected, info = "Arbitrary ranges (4).")
+
+ ##
+ ## the removal of duplicate ranges
+ ##
+
+ df = data.frame(date = dates, id = seq_along(dates))
+ expected = expected.ranges[c(1, 4, 9)]
+ result = construct.ranges(
+ split.get.bins.activity.based(df, "id", activity.amount = split.activity.amount, remove.duplicate.bins = TRUE)[["bins"]],
+ sliding.window = FALSE
+ )
+ expect_identical(result, expected, info = "Removal of duplicate ranges.")
+
+})
diff --git a/tests/test-split-network-activity-based.R b/tests/test-split-network-activity-based.R new file mode 100644 index 00000000..52d7b8f0 --- /dev/null +++ b/tests/test-split-network-activity-based.R @@ -0,0 +1,280 @@
+## This file is part of coronet, which is free software: you
+## can redistribute it and/or modify it under the terms of the GNU General
+## Public License as published by the Free Software Foundation, version 2.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+##
+## Copyright 2017-2019 by Claus Hunsen
+## Copyright 2020 by Thomas Bock
+## Copyright 2018 by Jakob Kronawitter
+## Copyright 2022 by Jonathan Baumann
+## All Rights Reserved.
+
+context("Splitting functionality, activity-based splitting of networks.")
+
+##
+## Context
+##
+
+CF.DATA = file.path(".", "codeface-data")
+CF.SELECTION.PROCESS = "testing"
+CASESTUDY = "test"
+ARTIFACT = "feature"
+
+## use only when debugging this file independently
+if (!dir.exists(CF.DATA)) CF.DATA = file.path(".", "tests", "codeface-data")
+
+
+## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
+## Split network -----------------------------------------------------------
+
+## * activity-based ------------------------------------------------------------
+
+##
+## Tests for split.network.activity.based(...)
+## + +patrick::with_parameters_test_that("Split a network activity-based (number.edges, number.windows).", { + + ## configuration and data objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("commits.filter.base.artifact", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) + net.conf = NetworkConf$new() + net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) + project.data = ProjectData$new(proj.conf) + net.builder = NetworkBuilder$new(project.data, net.conf) + + ## retrieve author network + author.net = net.builder$get.author.network() + + ## + ## number.edges (1) + ## + + ## results + expected = list( + "2016-07-12 15:58:59-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(1, 2)), + "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(3, 5)), + "2016-07-12 16:06:10-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(4, 7)), + "2016-07-12 16:06:32-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(6, 8)) + ) + results = split.network.activity.based(author.net, number.edges = 2) + + ## check ranges (labels) + expect_equal(names(results), names(expected), info = "Time ranges (number.edges (1)).") + + ## check networks + check.identical = mapply(results, expected, FUN = function(r, e) { + igraph::identical_graphs(r, e) + }) + expect_true(all(check.identical), info = "Network equality (number.edges (1)).") + + ## + ## number.edges (2) + ## + + ## results + expected = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(1:igraph::ecount(author.net))) + ) + results = split.network.activity.based(author.net, number.edges = igraph::ecount(author.net) + 10) + + ## check ranges (labels) + expect_equal(names(results), names(expected), info = "Time ranges (number.edges (2)).") + + ## check networks + check.identical = mapply(results, expected, FUN = function(r, e) { + igraph::identical_graphs(r, e) + }) + expect_true(all(check.identical), info = "Network equality (number.edges (2)).") + + ## + ## number.windows (1) + ## + + ## results + expected = list( + "2016-07-12 15:58:59-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(1, 2, 3)), + "2016-07-12 16:05:41-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(4, 5, 7)), + "2016-07-12 16:06:32-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(6, 8)) + ) + results = split.network.activity.based(author.net, number.windows = 3) + + ## check ranges (labels) + expect_equal(names(results), names(expected), info = "Time ranges (number.windows (1)).") + + ## check networks + check.identical = mapply(results, expected, FUN = function(r, e) { + igraph::identical_graphs(r, e) + }) + expect_true(all(check.identical), info = "Network equality (number.windows (1)).") + + ## + ## number.windows (2) + ## + + expect_error( + split.network.activity.based(author.net, number.windows = igraph::ecount(author.net) + 10), + info = "Error expected (number.windows (2))." + ) + +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) + +## * * sliding windows + +## +## Tests for split.network.activity.based(...) 
using sliding windows +## + +patrick::with_parameters_test_that("Split a network activity-based (number.edges, number.windows, sliding.window = TRUE).", { + + ## configuration and data objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("commits.filter.base.artifact", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) + net.conf = NetworkConf$new() + net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) + project.data = ProjectData$new(proj.conf) + net.builder = NetworkBuilder$new(project.data, net.conf) + + ## retrieve author network + author.net = net.builder$get.author.network() + + ## + ## number.edges (1) + ## + + ## results + expected = list( + "2016-07-12 15:58:59-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(1, 2)), + "2016-07-12 16:00:45-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(2, 3)), + "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(3, 5)), + "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(5, 4)), + "2016-07-12 16:06:10-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(4, 7)), + "2016-07-12 16:06:10-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(7, 6)), + "2016-07-12 16:06:32-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(6, 8)) + ) + results = split.network.activity.based(author.net, number.edges = 2, sliding.window = TRUE) + + ## check ranges (labels) + expect_equal(names(results), names(expected), info = "Time ranges (number.edges (1)).") + + ## check networks + check.identical = mapply(results, expected, FUN = function(r, e) { + igraph::identical_graphs(r, e) + }) + expect_true(all(check.identical), info = "Network equality (number.edges (1)).") + + ## + ## number.edges (2) + ## + + ## results + expected = list( + "2016-07-12 15:58:59-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(1:igraph::ecount(author.net))) + ) + results = split.network.activity.based(author.net, number.edges = igraph::ecount(author.net) + 10, + sliding.window = TRUE) + + ## check ranges (labels) + expect_equal(names(results), names(expected), info = "Time ranges (number.edges (2)).") + + ## check networks + check.identical = mapply(results, expected, FUN = function(r, e) { + igraph::identical_graphs(r, e) + }) + expect_true(all(check.identical), info = "Network equality (number.edges (2)).") + + ## + ## number.windows (1) (i.e., ignoring sliding windows) + ## + + ## results + expected = list( + "2016-07-12 15:58:59-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(1, 2, 3)), + "2016-07-12 16:05:41-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(4, 5, 7)), + "2016-07-12 16:06:32-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(6, 8)) + ) + results = split.network.activity.based(author.net, number.windows = 3, sliding.window = TRUE) + + ## check ranges (labels) + expect_equal(names(results), names(expected), info = "Time ranges (number.windows (1)).") + + ## check networks + check.identical = mapply(results, expected, FUN = function(r, e) { + igraph::identical_graphs(r, e) + }) + expect_true(all(check.identical), info = "Network equality (number.windows (1)).") + + ## + ## number.windows (2) (i.e., ignoring sliding windows) + ## + + expect_error( + split.network.activity.based(author.net, number.windows = igraph::ecount(author.net) + 10, + sliding.window = TRUE), + info = "Error expected 
(number.windows (2))." + ) + +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) + +patrick::with_parameters_test_that("Split a network activity-based (number.edges, number.windows, sliding.window = TRUE), continued.", { + + ## configuration and data objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("commits.filter.base.artifact", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) + net.conf = NetworkConf$new() + net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) + project.data = ProjectData$new(proj.conf) + net.builder = NetworkBuilder$new(project.data, net.conf) + + ## retrieve author network and add an additional edge in the end + author.net = net.builder$get.author.network() + author.net = igraph::add_edges(author.net, c("Olaf", "Thomas"), + attr = list(date = get.date.from.string("2020-02-20 20:20:20"))) + + ## + ## number.edges (1) + ## + + ## results + expected = list( + "2016-07-12 15:58:59-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(1, 2)), + "2016-07-12 16:00:45-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(2, 3)), + "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(3, 5)), + "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(5, 4)), + "2016-07-12 16:06:10-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(4, 7)), + "2016-07-12 16:06:10-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(7, 6)), + "2016-07-12 16:06:32-2020-02-20 20:20:20" = igraph::subgraph.edges(author.net, c(6, 8)), + "2016-07-12 16:06:32-2020-02-20 20:20:21" = igraph::subgraph.edges(author.net, c(8, 9)) + ) + results = split.network.activity.based(author.net, number.edges = 2, sliding.window = TRUE) + + ## check ranges (labels) + expect_equal(names(results), names(expected), info = "Time ranges (number.edges (1)).") + + ## check networks + check.identical = mapply(results, expected, FUN = function(r, e) { + igraph::identical_graphs(r, e) + }) + expect_true(all(check.identical), info = "Network equality (number.edges (1)).") + +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) diff --git a/tests/test-split-network-time-based.R b/tests/test-split-network-time-based.R new file mode 100644 index 00000000..bdcd21d3 --- /dev/null +++ b/tests/test-split-network-time-based.R @@ -0,0 +1,456 @@ +## This file is part of coronet, which is free software: you +## can redistribute it and/or modify it under the terms of the GNU General +## Public License as published by the Free Software Foundation, version 2. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+##
+## Copyright 2017-2019 by Claus Hunsen
+## Copyright 2018 by Thomas Bock
+## Copyright 2020 by Thomas Bock
+## Copyright 2018 by Jakob Kronawitter
+## Copyright 2022 by Jonathan Baumann
+## All Rights Reserved.
+
+context("Splitting functionality, time-based splitting of networks.")
+
+##
+## Context
+##
+
+CF.DATA = file.path(".", "codeface-data")
+CF.SELECTION.PROCESS = "testing"
+CASESTUDY = "test"
+ARTIFACT = "feature"
+
+## use only when debugging this file independently
+if (!dir.exists(CF.DATA)) CF.DATA = file.path(".", "tests", "codeface-data")
+
+
+## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
+## Split network -----------------------------------------------------------
+
+## * time-based ------------------------------------------------------------
+
+## * * time period ---------------------------------------------------------
+
+##
+## Tests for split.network.time.based(..., time.period = ...)
+##
+
+patrick::with_parameters_test_that("Split a network time-based (time.period = ...).", {
+
+ ## time period
+ time.period = "2 mins"
+
+ ## configuration and data objects
+ proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT)
+ proj.conf$update.value("commits.filter.base.artifact", FALSE)
+ proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity))
+ net.conf = NetworkConf$new()
+ net.conf$update.values(list(author.relation = "cochange", simplify = FALSE))
+ project.data = ProjectData$new(proj.conf)
+ net.builder = NetworkBuilder$new(project.data, net.conf)
+
+ ##
+ ## simplify = FALSE
+ ##
+
+ ## retrieve author network
+ author.net = net.builder$get.author.network()
+
+ expected = list(
+ "2016-07-12 15:58:59-2016-07-12 16:00:59" = igraph::subgraph.edges(author.net, c(1:2)),
+ "2016-07-12 16:00:59-2016-07-12 16:02:59" = igraph::subgraph.edges(author.net, c()),
+ "2016-07-12 16:02:59-2016-07-12 16:04:59" = igraph::subgraph.edges(author.net, c()),
+ "2016-07-12 16:04:59-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(3:8))
+ )
+ results = split.network.time.based(author.net, time.period = time.period)
+
+ ## check ranges (labels)
+ expect_equal(names(results), names(expected), info = "Time ranges.")
+
+ ## check networks
+ check.identical = mapply(results, expected, FUN = function(r, e) {
+ igraph::identical_graphs(r, e)
+ })
+ expect_true(all(check.identical), info = "Network equality.")
+
+ ##
+ ## simplify = TRUE
+ ##
+
+ ## update network configuration
+ net.builder$update.network.conf(list(author.relation = "cochange", simplify = TRUE))
+ net.builder$reset.environment()
+
+ ## retrieve author network
+ author.net = net.builder$get.author.network()
+
+ ## splitting a simplified network must raise an error
+ expect_error(split.network.time.based(author.net, time.period = time.period), info = "Illegal split.")
+
+}, patrick::cases(
+ "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE),
+ "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE)
+))
+
+##
+## Tests for split.networks.time.based(..., time.period = ...)
+##
+
+patrick::with_parameters_test_that("Split a list of networks time-based, ", {
+
+ ## time period
+ time.period = "2 years"
+
+ ## configuration and data objects
+ proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT)
+ proj.conf$update.value("commits.filter.base.artifact", FALSE)
+ proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity))
+ net.conf = NetworkConf$new()
+ net.conf$update.values(list(simplify = FALSE, author.directed = TRUE))
+ project.data = ProjectData$new(proj.conf)
+ net.builder = NetworkBuilder$new(project.data, net.conf)
+
+ ## obtain networks:
+ ## 1) co-change network
+ net.builder$update.network.conf(list(author.relation = "cochange"))
+ net.cochange = net.builder$get.author.network()
+ ## 2) mail network
+ net.builder$update.network.conf(list(author.relation = "mail"))
+ net.mail = net.builder$get.author.network()
+
+ ## split networks
+ net.split = split.networks.time.based(
+ networks = list(net.cochange, net.mail),
+ time.period = time.period,
+ sliding.window = test.sliding.window
+ )
+
+ ## check whether the splitting information of the two split networks is identical
+ expect_identical(attributes(net.split[[1]]), attributes(net.split[[2]]), info = "Splitting information.")
+
+ ## check that this also works with a single network in the list (if not, an error will occur)
+ net.split = split.networks.time.based(
+ networks = list(net.mail),
+ time.period = time.period,
+ sliding.window = test.sliding.window
+ )
+
+}, cases.cross.product(
+ patrick::cases(
+ "sliding window: FALSE" = list(test.sliding.window = FALSE),
+ "sliding window: TRUE" = list(test.sliding.window = TRUE)
+ ),
+ patrick::cases(
+ "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE),
+ "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE)
+ )
+))
+
+## * * time period, sliding windows ----------------------------------------
+
+##
+## Tests for split.network.time.based(..., time.period = ...) using sliding windows
+##
+
+patrick::with_parameters_test_that("Split a network time-based (time.period = ..., sliding.window = TRUE).", {
, sliding.window = TRUE).", { + + ## time period + time.period = "2 mins" + + ## configuration and data objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("commits.filter.base.artifact", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) + net.conf = NetworkConf$new() + net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) + project.data = ProjectData$new(proj.conf) + net.builder = NetworkBuilder$new(project.data, net.conf) + + ## + ## simplify = FALSE + ## + + ## retrieve author network + author.net = net.builder$get.author.network() + + expected = list( + "2016-07-12 15:58:59-2016-07-12 16:00:59" = igraph::subgraph.edges(author.net, c(1:2)), + "2016-07-12 15:59:59-2016-07-12 16:01:59" = igraph::subgraph.edges(author.net, c(2)), + "2016-07-12 16:00:59-2016-07-12 16:02:59" = igraph::subgraph.edges(author.net, c()), + "2016-07-12 16:01:59-2016-07-12 16:03:59" = igraph::subgraph.edges(author.net, c()), + "2016-07-12 16:02:59-2016-07-12 16:04:59" = igraph::subgraph.edges(author.net, c()), + "2016-07-12 16:03:59-2016-07-12 16:05:59" = igraph::subgraph.edges(author.net, c(3,5)), + "2016-07-12 16:04:59-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(3:8)) + ) + results = split.network.time.based(author.net, time.period = "2 mins", sliding.window = TRUE) + + ## check ranges (labels) + expect_equal(names(results), names(expected), info = "Time ranges.") + + ## check networks + check.identical = mapply(results, expected, FUN = function(r, e) { + igraph::identical_graphs(r, e) + }) + expect_true(all(check.identical), info = "Network equality.") + + ## + ## simplify = TRUE + ## + + ## update network configuration + net.builder$update.network.conf(list(author.relation = "cochange", simplify = TRUE)) + net.builder$reset.environment() + + ## retrieve author network + author.net = net.builder$get.author.network() + + expect_error(split.network.time.based(author.net, bins = bins, sliding.window = TRUE), info = "Illegal split.") + +}, patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) +)) + +## * * bins ---------------------------------------------------------------- + +## +## Tests for split.network.time.based(..., bins = ...) 
+## + +patrick::with_parameters_test_that("Split a network time-based (bins = ...), ", { + + ## bins + bins = c("2016-07-12 15:58:00", "2016-07-12 16:00:59", "2016-07-12 16:02:59", + "2016-07-12 16:04:59", "2016-07-12 17:21:43") + + ## configuration and data objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("commits.filter.base.artifact", FALSE) + proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity)) + net.conf = NetworkConf$new() + net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) + project.data = ProjectData$new(proj.conf) + net.builder = NetworkBuilder$new(project.data, net.conf) + + ## + ## simplify = FALSE + ## + + ## retrieve author network + author.net = net.builder$get.author.network() + + ## results + expected = list( + "2016-07-12 15:58:00-2016-07-12 16:00:59" = igraph::subgraph.edges(author.net, c(1:2)), + "2016-07-12 16:00:59-2016-07-12 16:02:59" = igraph::subgraph.edges(author.net, c()), + "2016-07-12 16:02:59-2016-07-12 16:04:59" = igraph::subgraph.edges(author.net, c()), + "2016-07-12 16:04:59-2016-07-12 17:21:43" = igraph::subgraph.edges(author.net, c(3:8)) + ) + results = split.network.time.based(author.net, bins = bins, sliding.window = test.sliding.window) + + ## check ranges (labels) + expect_equal(names(results), names(expected), info = "Time ranges.") + + ## check networks + check.identical = mapply(results, expected, FUN = function(r, e) { + igraph::identical_graphs(r, e) + }) + expect_true(all(check.identical), info = "Network equality.") + + ## + ## simplify = TRUE + ## + + ## update network configuration + net.conf$update.values(list(author.relation = "cochange", simplify = TRUE)) + net.builder$reset.environment() + + ## retrieve author network + author.net = net.builder$get.author.network() + + expect_error(split.network.time.based(author.net, bins = bins, sliding.window = test.sliding.window), + info = "Illegal split.") + +}, cases.cross.product( + patrick::cases( + "sliding window (ignored): FALSE" = list(test.sliding.window = FALSE), + "sliding window (ignored): TRUE" = list(test.sliding.window = TRUE) + ), + patrick::cases( + "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE), + "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) + ) +)) + +## * * ranges -------------------------------------------------------------------- + +## +## Test splitting network by ranges. 
+##
+
+patrick::with_parameters_test_that("Test splitting network by ranges", {
+
+ ## bins
+ bins = c("2016-07-12 15:58:00", "2016-07-12 16:00:59", "2016-07-12 16:02:59",
+ "2016-07-12 16:04:59", "2016-07-12 17:21:43")
+ ranges = construct.ranges(bins, sliding.window = FALSE, raw = TRUE)
+
+ ## configuration and data objects
+ proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT)
+ proj.conf$update.value("commits.filter.base.artifact", FALSE)
+ proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity))
+ net.conf = NetworkConf$new()
+ net.conf$update.values(list(author.relation = "cochange", simplify = FALSE))
+ project.data = ProjectData$new(proj.conf)
+ net.builder = NetworkBuilder$new(project.data, net.conf)
+
+ ## retrieve author network
+ author.net = net.builder$get.author.network()
+ expected.results = split.network.time.based(author.net, bins = bins)
+ results = split.network.time.based.by.ranges(author.net, ranges)
+
+ ## check time ranges
+ expect_equal(names(results), names(ranges), info = "Time ranges.")
+
+ ## check data for all ranges
+ check.identical = mapply(results, expected.results, FUN = function(r, e) {
+ return(igraph::identical_graphs(r, e))
+ })
+ expect_true(all(check.identical), info = "Network equality (split by ranges).")
+}, patrick::cases(
+ "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE),
+ "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE)
+))
+
+## * * window numbers ---------------------------------------------------------
+
+##
+## Tests for split.network.time.based(..., number.windows = ...)
+##
+
+patrick::with_parameters_test_that("Split a network time-based with equal-sized windows (number.windows = ...).", {
+
+ ## configuration and data objects
+ proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT)
+ proj.conf$update.value("commits.filter.base.artifact", FALSE)
+ proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity))
+ net.conf = NetworkConf$new()
+ net.conf$update.values(list(author.relation = "cochange", simplify = FALSE))
+ project.data = ProjectData$new(proj.conf)
+ net.builder = NetworkBuilder$new(project.data, net.conf)
+
+ ##
+ ## simplify = FALSE
+ ##
+
+ ## retrieve author network
+ author.net = net.builder$get.author.network()
+
+ expected = list(
+ "2016-07-12 15:58:59-2016-07-12 16:00:53" = igraph::subgraph.edges(author.net, c(1:2)),
+ "2016-07-12 16:00:53-2016-07-12 16:02:47" = igraph::subgraph.edges(author.net, c()),
+ "2016-07-12 16:02:47-2016-07-12 16:04:41" = igraph::subgraph.edges(author.net, c()),
+ "2016-07-12 16:04:41-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(3:8))
+ )
+ results = split.network.time.based(author.net, number.windows = 4)
+
+ ## check ranges (labels)
+ expect_equal(names(results), names(expected), info = "Time ranges.")
+
+ ## check networks
+ check.identical = mapply(results, expected, FUN = function(r, e) {
+ igraph::identical_graphs(r, e)
+ })
+ expect_true(all(check.identical), info = "Network equality.")
+
+ ##
+ ## simplify = TRUE
+ ##
+
+ ## update network configuration
+ net.builder$update.network.conf(list(author.relation = "cochange", simplify = TRUE))
+ net.builder$reset.environment()
+
+ ## retrieve author network
+ author.net = net.builder$get.author.network()
+
+ ## splitting a simplified network must raise an error
+ expect_error(split.network.time.based(author.net, number.windows = 4), info = "Illegal split.")
+
+}, patrick::cases(
+ "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE),
+ "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE)
+))
+
+##
+## Tests for split.networks.time.based(..., number.windows = ...)
+##
+
+patrick::with_parameters_test_that("Split a list of networks time-based with equal-sized windows", {
+
+ ## configuration and data objects
+ proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT)
+ proj.conf$update.value("commits.filter.base.artifact", FALSE)
+ proj.conf$update.values(list(pasta = test.pasta, synchronicity = test.synchronicity))
+ net.conf = NetworkConf$new()
+ net.conf$update.values(list(simplify = FALSE, author.directed = TRUE))
+ project.data = ProjectData$new(proj.conf)
+ net.builder = NetworkBuilder$new(project.data, net.conf)
+
+ ## obtain networks:
+ ## 1) co-change network
+ net.builder$update.network.conf(list(author.relation = "cochange"))
+ net.cochange = net.builder$get.author.network()
+ ## 2) mail network
+ net.builder$update.network.conf(list(author.relation = "mail"))
+ net.mail = net.builder$get.author.network()
+
+ ## split networks
+ net.split = split.networks.time.based(
+ networks = list(net.cochange, net.mail),
+ number.windows = 3,
+ sliding.window = test.sliding.window # this parameter should be ignored if number.windows is given
+ )
+
+ ## check whether the splitting information of the two split networks is identical
+ expect_identical(attributes(net.split[[1]]), attributes(net.split[[2]]), info = "Splitting information.")
+
+ ## check whether the splitting information is as expected
+ expected = list(
+ "bins" = c(get.date.from.string("2010-07-12 12:05:41"),
+ get.date.from.string("2012-07-12 05:25:58"),
+ get.date.from.string("2014-07-12 22:46:15"),
+ get.date.from.string("2016-07-12 16:06:33")),
+ "names" = c("2010-07-12 12:05:41-2012-07-12 05:25:58",
+ "2012-07-12 05:25:58-2014-07-12 22:46:15",
+ "2014-07-12 22:46:15-2016-07-12 16:06:33")
+ )
+
+ ## R 3.4 fails if this is expect_identical
+ expect_equal(expected, attributes(net.split[[1]]), info = "Splitting information.")
+
+ ## check that this also works with a single network in the list (if not, an error will occur)
+ net.split = split.networks.time.based(
+ networks = list(net.mail),
+ number.windows = 3,
+ sliding.window = test.sliding.window
+ )
+
+}, cases.cross.product(
+ patrick::cases(
+ "sliding window: FALSE" = list(test.sliding.window = FALSE),
+ "sliding window: TRUE" = list(test.sliding.window = TRUE)
+ ),
+ patrick::cases(
+ "pasta, synchronicity: FALSE" = list(test.pasta = FALSE, test.synchronicity = FALSE),
+ "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE)
+ )
+))
diff --git a/tests/test-split.R b/tests/test-split.R deleted file mode 100644 index 193eb020..00000000 --- a/tests/test-split.R +++ /dev/null @@ -1,2052 +0,0 @@
-## This file is part of coronet, which is free software: you
-## can redistribute it and/or modify it under the terms of the GNU General
-## Public License as published by the Free Software Foundation, version 2.
-##
-## This program is distributed in the hope that it will be useful,
-## but WITHOUT ANY WARRANTY; without even the implied warranty of
-## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-## GNU General Public License for more details.
-##
-## You should have received a copy of the GNU General Public License along
-## with this program; if not, write to the Free Software Foundation, Inc.,
-## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-## -## Copyright 2017-2019 by Claus Hunsen -## Copyright 2017 by Felix Prasse -## Copyright 2018 by Thomas Bock -## Copyright 2020 by Thomas Bock -## Copyright 2018 by Christian Hechtl -## Copyright 2018 by Jakob Kronawitter -## Copyright 2019 by Anselm Fehnker -## Copyright 2021 by Niklas Schneider -## Copyright 2021 by Johannes Hostert -## All Rights Reserved. - - -context("Splitting functionality.") - -## -## Context -## - -CF.DATA = file.path(".", "codeface-data") -CF.SELECTION.PROCESS = "testing" -CASESTUDY = "test" -ARTIFACT = "feature" - -## use only when debugging this file independently -if (!dir.exists(CF.DATA)) CF.DATA = file.path(".", "tests", "codeface-data") - - -## -## NOTE -## - -## In this test file, we rather test the raw data contents of the data objects -## instead of the networks that can be constructed from these data items! - - -## -## TODO -## - -## - net.conf$update.values(list(pasta = TRUE, synchronicity = TRUE)) - - -## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / -## Split data -------------------------------------------------------------- - -## * time-based ------------------------------------------------------------ - -## * * time period --------------------------------------------------------- - -## -## Tests for split.data.time.based(..., split.basis = 'commits') -## - -test_that("Split a data object time-based (split.basis = 'commits').", { - - ## configuration objects - proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("issues.only.comments", FALSE) - net.conf = NetworkConf$new() - - ## data object - project.data = ProjectData$new(proj.conf) - data = list( - commits = project.data$get.commits.unfiltered(), - commit.messages = project.data$get.commit.messages(), - issues = project.data$get.issues(), - mails = project.data$get.mails(), - pasta = project.data$get.pasta(), - synchronicity = project.data$get.synchronicity() - ) - - ## split data - results = split.data.time.based(project.data, time.period = "3 min", - split.basis = "commits", sliding.window = FALSE) - - ## check time ranges - expected = c( - "2016-07-12 15:58:59-2016-07-12 16:01:59", - "2016-07-12 16:01:59-2016-07-12 16:04:59", - "2016-07-12 16:04:59-2016-07-12 16:06:33" - ) - lapply(results, function(res) { - expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") - }) - - ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. 
- expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), - info = "Splitting must mot modify the original ProjectConf.") - - ## test that the config contains the correct splitting information - expected.config = list( - split.type = "time-based", - split.length = "3 min", - split.basis = "commits", - split.sliding.window = FALSE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:01:59", "2016-07-12 16:04:59", "2016-07-12 16:06:33"), - split.revision.dates = NULL - ) - lapply(results, function(res) { - actual = lapply(names(expected.config), res$get.project.conf()$get.value) - names(actual) = names(expected.config) - expect_equal(expected.config, actual) - }) - - ## check data for all ranges - expected.data = list( - commits = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$commits[1:2, ], - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$commits[0, ], - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commits[3:8, ] - ), - commit.messages = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$commit.messages, - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$commit.messages, - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commit.messages - ), - issues = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$issues[rownames(data$issues) %in% c(14, 20:22, 37:40), ], - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$issues[rownames(data$issues) %in% c(15,29, 47:49), ], - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(23,41,45:46), ] - ), - mails = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$mails[0, ], - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$mails[rownames(data$mails) == 16, ], - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$mails[rownames(data$mails) == 17, ] - ), - pasta = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$pasta, - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$pasta, - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$pasta - ), - synchronicity = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$synchronicity, - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$synchronicity, - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$synchronicity - ) - ) - results.data = list( - commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), - commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), - issues = lapply(results, function(cf.data) cf.data$get.issues()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) - ) - - expect_equal(results.data, expected.data, info = "Data for ranges.") - -}) - - -## -## Tests for split.data.time.based(..., split.basis = 'mails') -## - -test_that("Split a data object time-based (split.basis = 'mails').", { - - ## configuration objects - proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("issues.only.comments", FALSE) - net.conf = NetworkConf$new() - - ## data object - project.data = ProjectData$new(proj.conf) - data = list( - commits = project.data$get.commits.unfiltered(), - commit.messages = project.data$get.commit.messages(), - issues = project.data$get.issues(), - mails = project.data$get.mails(), - pasta = project.data$get.pasta(), - synchronicity = project.data$get.synchronicity() - ) - - ## split data - results = split.data.time.based(project.data, time.period = "3 
years", - split.basis = "mails", sliding.window = FALSE) - - ## check time ranges - expected = c( - "2004-10-09 18:38:13-2007-10-10 12:38:13", - "2007-10-10 12:38:13-2010-10-10 06:38:13", - "2010-10-10 06:38:13-2013-10-10 00:38:13", - "2013-10-10 00:38:13-2016-07-12 16:05:38" - ) - lapply(results, function(res) { - expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") - }) - - ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. - expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), - info = "Splitting must mot modify the original ProjectConf.") - - ## test that the config contains the correct splitting information - expected.config = list( - split.type = "time-based", - split.length = "3 years", - split.basis = "mails", - split.sliding.window = FALSE, - split.revisions = c("2004-10-09 18:38:13", "2007-10-10 12:38:13", "2010-10-10 06:38:13", - "2013-10-10 00:38:13", "2016-07-12 16:05:38"), - split.revision.dates = NULL - ) - lapply(results, function(res) { - actual = lapply(names(expected.config), res$get.project.conf()$get.value) - names(actual) = names(expected.config) - expect_equal(expected.config, actual) - }) - - ## check data for all ranges - expected.data = list( - commits = list( - "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$commits[0, ], - "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$commits[0, ], - "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$commits[0, ], - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$commits[1:2, ] - ), - commit.messages = list( - "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$commit.messages, - "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$commit.messages, - "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$commit.messages, - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$commit.messages - ), - issues = list( - "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$issues[0, ], - "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$issues[0, ], - "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$issues[rownames(data$issues) %in% 1:13, ], - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 27:29, 37:40, 43:49), ] - ), - mails = list( - "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$mails[rownames(data$mails) %in% 1:2, ], - "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$mails[rownames(data$mails) %in% 3:12, ], - "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$mails[0, ], - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 13:17, ] - ), - pasta = list( - "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$pasta, - "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$pasta, - "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$pasta, - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$pasta - ), - synchronicity = list( - "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$synchronicity, - "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$synchronicity, - "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$synchronicity, - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$synchronicity - ) - ) - results.data = list( - commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), - commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), - issues = lapply(results, function(cf.data) cf.data$get.issues()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()), - synchronicity = 
lapply(results, function(cf.data) cf.data$get.synchronicity()) - ) - - expect_equal(results.data, expected.data, info = "Data for ranges.") -}) - - -## -## Tests for split.data.time.based(..., split.basis = 'issues') -## - -test_that("Split a data object time-based (split.basis = 'issues').", { - - ## configuration objects - proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("issues.only.comments", FALSE) - net.conf = NetworkConf$new() - - ## data object - project.data = ProjectData$new(proj.conf) - data = list( - commits = project.data$get.commits.unfiltered(), - commit.messages = project.data$get.commit.messages(), - issues = project.data$get.issues(), - mails = project.data$get.mails(), - pasta = project.data$get.pasta(), - synchronicity = project.data$get.synchronicity() - ) - - ## split data - results = split.data.time.based(project.data, time.period = "2 years", - split.basis = "issues", sliding.window = FALSE) - - ## check time ranges - expected = c( - "2013-04-21 23:52:09-2015-04-22 11:52:09", - "2015-04-22 11:52:09-2017-04-21 23:52:09", - "2017-04-21 23:52:09-2017-05-23 12:32:40" - ) - lapply(results, function(res) { - expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") - }) - - ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. - expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), - info = "Splitting must mot modify the original ProjectConf.") - - ## test that the config contains the correct splitting information - expected.config = list( - split.type = "time-based", - split.length = "2 years", - split.basis = "issues", - split.sliding.window = FALSE, - split.revisions = c("2013-04-21 23:52:09", "2015-04-22 11:52:09", "2017-04-21 23:52:09", "2017-05-23 12:32:40"), - split.revision.dates = NULL - ) - lapply(results, function(res) { - actual = lapply(names(expected.config), res$get.project.conf()$get.value) - names(actual) = names(expected.config) - expect_equal(expected.config, actual) - }) - - ## check data for all ranges - expected.data = list( - commits = list( - "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$commits[0, ], - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$commits, - "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$commits[0, ] - ), - commit.messages = list( - "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$commit.messages, - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$commit.messages, - "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$commit.messages - ), - issues = list( - "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$issues[rownames(data$issues) %in% 1:13, ], - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% c(14:34, 37:49), ], - "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% 35:36, ] - ), - mails = list( - "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$mails[0, ], - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$mails[rownames(data$mails) %in% 14:17, ], - "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$mails[0, ] - ), - pasta = list( - "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$pasta, - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$pasta, - "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$pasta - ), - synchronicity = list( - "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$synchronicity, - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$synchronicity, - "2017-04-21 23:52:09-2017-05-23 12:32:40" = 
data$synchronicity - ) - ) - results.data = list( - commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), - commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), - issues = lapply(results, function(cf.data) cf.data$get.issues()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) - ) - - expect_equal(results.data, expected.data, info = "Data for ranges.") - -}) - -## * * bins ---------------------------------------------------------------- - -## -## Tests for split.data.time.based(..., bins = ...) -## - -test_that("Split a data object time-based (bins = ... ).", { - - ## configuration objects - proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("issues.only.comments", FALSE) - net.conf = NetworkConf$new() - - ## data object - project.data = ProjectData$new(proj.conf) - data = list( - commits = project.data$get.commits.unfiltered(), - commit.messages = project.data$get.commit.messages(), - issues = project.data$get.issues(), - mails = project.data$get.mails(), - pasta = project.data$get.pasta(), - synchronicity = project.data$get.synchronicity() - ) - - ## split data - results = split.data.time.based(project.data, bins = c("2016-01-01 00:00:00", "2016-12-31 23:59:59"), - split.basis = "mails", sliding.window = FALSE) - - ## check time ranges - expected = c( - "2016-01-01 00:00:00-2016-12-31 23:59:59" - ) - lapply(results, function(res) { - expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") - }) - - ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. 
- expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), - info = "Splitting must not modify the original ProjectConf.") - - ## test that the config contains the correct splitting information - expected.config = list( - split.type = "time-based", - split.length = c("2016-01-01 00:00:00", "2016-12-31 23:59:59"), - split.basis = NULL, - split.sliding.window = FALSE, - split.revisions = c("2016-01-01 00:00:00", "2016-12-31 23:59:59"), - split.revision.dates = NULL - ) - lapply(results, function(res) { - actual = lapply(names(expected.config), res$get.project.conf()$get.value) - names(actual) = names(expected.config) - expect_equal(expected.config, actual) - }) - - - ## check data for all ranges - expected.data = list( - commits = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$commits - ), - commit.messages = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$commit.messages - ), - issues = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$issues[rownames(data$issues) %in% c(14:34, 37:49), ] - ), - mails = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$mails[rownames(data$mails) %in% 13:17, ] - ), - pasta = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$pasta - ), - synchronicity = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$synchronicity - ) - ) - results.data = list( - commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), - commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), - issues = lapply(results, function(cf.data) cf.data$get.issues()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) - ) - - expect_equal(results.data, expected.data, info = "Data for ranges.") -}) - -## * * ranges -------------------------------------------------------------- - -## -## Test splitting data by network names. 
-## - -test_that("Test splitting data by networks", { - ## configuration and data objects - proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("commits.filter.base.artifact", FALSE) - net.conf = NetworkConf$new() - net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) - - ## construct project data - project.data = ProjectData$new(proj.conf) - - ## split data - mybins = get.date.from.string(c("2016-07-12 15:00:00", "2016-07-12 16:00:00", - "2016-07-12 16:05:00", "2016-10-05 09:00:00")) - input.data = split.data.time.based(project.data, bins = mybins) - input.data.network = lapply(input.data, function(d) NetworkBuilder$new(d, net.conf)$get.author.network()) - - ## split data by networks - aggregation.level = c("range", "cumulative", "all.ranges", - "project.cumulative", "project.all.ranges", - "complete") - results = lapply(aggregation.level, function(level) - split.data.by.networks(input.data.network, project.data, level) - ) - names(results) = aggregation.level - - ## construct expected ranges - expected.ranges = list( - range = c("2016-07-12 15:00:00-2016-07-12 16:00:00", - "2016-07-12 16:00:00-2016-07-12 16:05:00", - "2016-07-12 16:05:00-2016-10-05 09:00:00"), - cumulative = c("2016-07-12 15:00:00-2016-07-12 16:00:00", - "2016-07-12 15:00:00-2016-07-12 16:05:00", - "2016-07-12 15:00:00-2016-10-05 09:00:00"), - all.ranges = c("2016-07-12 15:00:00-2016-10-05 09:00:00", - "2016-07-12 15:00:00-2016-10-05 09:00:00", - "2016-07-12 15:00:00-2016-10-05 09:00:00"), - project.cumulative = c("2004-10-09 18:38:13-2016-07-12 16:00:00", - "2004-10-09 18:38:13-2016-07-12 16:05:00", - "2004-10-09 18:38:13-2016-10-05 09:00:00"), - project.all.ranges = c("2004-10-09 18:38:13-2016-10-05 09:00:00", - "2004-10-09 18:38:13-2016-10-05 09:00:00", - "2004-10-09 18:38:13-2016-10-05 09:00:00"), - complete = c("2004-10-09 18:38:13-2017-05-23 12:32:40", - "2004-10-09 18:38:13-2017-05-23 12:32:40", - "2004-10-09 18:38:13-2017-05-23 12:32:40") - ) - - ## test the ranges - test.each.network = function(aggregation.level) { - result.data = results[[aggregation.level]] - expected.range.names = expected.ranges[[aggregation.level]] - - lapply(seq_along(result.data), function(i) { - result.entry = result.data[[i]] - - expect_true(igraph::identical_graphs(result.entry[["network"]], input.data.network[[i]])) - expect_equal(result.entry[["data"]]$get.range(), expected.range.names[[i]]) - }) - } - lapply(aggregation.level, test.each.network) -}) - -## -## Test splitting data by ranges. 
-## - -test_that("Test splitting data by ranges", { - ## configuration and data objects - proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("commits.filter.base.artifact", FALSE) - net.conf = NetworkConf$new() - net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) - - ## construct project data - project.data = ProjectData$new(proj.conf) - - ## split data - my.bins = get.date.from.string(c("2016-07-12 15:00:00", "2016-07-12 16:00:00", - "2016-07-12 16:05:00", "2016-10-05 09:00:00")) - my.ranges = construct.ranges(my.bins, sliding.window = FALSE) - expected.results = split.data.time.based(project.data, bins = my.bins) - results = split.data.time.based.by.ranges(project.data, my.ranges) - - ## check time ranges - expect_equal(names(results), my.ranges, info = "Time ranges.") - - ## check data for all ranges - expected.data = list( - commits = lapply(expected.results, function(cf.data) cf.data$get.commits.unfiltered()), - commit.messages = lapply(expected.results, function(cf.data) cf.data$get.commit.messages()), - issues = lapply(expected.results, function(cf.data) cf.data$get.issues()), - mails = lapply(expected.results, function(cf.data) cf.data$get.mails()), - pasta = lapply(expected.results, function(cf.data) cf.data$get.pasta()), - synchronicity = lapply(expected.results, function(cf.data) cf.data$get.synchronicity()) - ) - results.data = list( - commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), - commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), - issues = lapply(results, function(cf.data) cf.data$get.issues()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) - ) - expect_equal(results.data, expected.data, info = "Data for ranges.") -}) - -## * activity-based -------------------------------------------------------- - -## -## Tests for split.data.activity.based(..., activity.type = 'commits') -## - -test_that("Split a data object activity-based (activity.type = 'commits').", { - - ## configuration objects - proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("issues.only.comments", FALSE) - net.conf = NetworkConf$new() - - ## data object - project.data = ProjectData$new(proj.conf) - data = list( - commits = project.data$get.commits.unfiltered(), - commit.messages = project.data$get.commit.messages(), - issues = project.data$get.issues(), - mails = project.data$get.mails(), - pasta = project.data$get.pasta(), - synchronicity = project.data$get.synchronicity() - ) - - ## split data - results = split.data.activity.based(project.data, activity.amount = 3, - activity.type = "commits", sliding.window = FALSE) - - ## check time ranges - expected = c( - "2016-07-12 15:58:59-2016-07-12 16:06:10", - "2016-07-12 16:06:10-2016-07-12 16:06:32", - "2016-07-12 16:06:32-2016-07-12 16:06:33" - ) - lapply(results, function(res) { - expect_equal(res$get.project.conf()$get.value("ranges"), expected, - info = "Time ranges (activity.amount).") - }) - - ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. 
- expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), - info = "Splitting must not modify the original ProjectConf.") - - ## test that the config contains the correct splitting information - expected.config = list( - split.type = "activity-based", - split.length = 3, - split.basis = "commits", - split.sliding.window = FALSE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:10", "2016-07-12 16:06:32", "2016-07-12 16:06:33"), - split.revision.dates = NULL - ) - lapply(results, function(res) { - actual = lapply(names(expected.config), res$get.project.conf()$get.value) - names(actual) = names(expected.config) - expect_equal(expected.config, actual) - }) - - ## check data for all ranges - expected.data = list( - commits = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commits[1:3, ], - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commits[4:6, ], - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commits[7:8, ] - ), - commit.messages = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commit.messages, - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commit.messages, - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commit.messages - ), - issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:41, 45:49), ], - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 23, ], - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$issues[0, ] - ), - mails = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[rownames(data$mails) %in% 16:17, ], - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$mails[0, ], - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$mails[0, ] - ), - pasta = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$pasta, - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$pasta, - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$pasta - ), - synchronicity = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$synchronicity, - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$synchronicity, - "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$synchronicity - ) - ) - results.data = list( - commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), - commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), - issues = lapply(results, function(cf.data) cf.data$get.issues()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) - ) - expect_equal(results.data, expected.data, info = "Data for ranges (activity.amount).") - - ## - ## split by too-large activity amount - ## - - ## split data - results = split.data.activity.based(project.data, activity.amount = nrow(data$commits) + 10, - activity.type = "commits", sliding.window = FALSE) - - ## check time ranges - expected = c( - "2016-07-12 15:58:59-2016-07-12 16:06:33" - ) - lapply(results, function(res) { - expect_equal(res$get.project.conf()$get.value("ranges"), expected, - info = "Time ranges (too-large activity amount).") - }) - - ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. 
- expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), - info = "Splitting must not modify the original ProjectConf.") - - ## test that the config contains the correct splitting information - expected.config = list( - split.type = "activity-based", - split.length = 18, - split.basis = "commits", - split.sliding.window = FALSE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:33"), - split.revision.dates = NULL - ) - lapply(results, function(res) { - actual = lapply(names(expected.config), res$get.project.conf()$get.value) - names(actual) = names(expected.config) - expect_equal(expected.config, actual) - }) - - ## check data for all ranges - expected.data = list( - commits = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$commits - ), - commit.messages = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$commit.messages - ), - issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(14:15, 20:23, 29, 37:41, 45:49), ] - ), - mails = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$mails[rownames(data$mails) %in% 16:17, ] - ), - pasta = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$pasta - ), - synchronicity = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$synchronicity - ) - ) - results.data = list( - commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), - commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), - issues = lapply(results, function(cf.data) cf.data$get.issues()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) - ) - expect_equal(results.data, expected.data, info = "Data for ranges for too-large activity amount (activity.amount).") - - ## - ## split by number of windows - ## - - ## split data - results = split.data.activity.based(project.data, number.windows = 2, - activity.type = "commits", sliding.window = FALSE) - - ## check time ranges - expected = c( - "2016-07-12 15:58:59-2016-07-12 16:06:20", - "2016-07-12 16:06:20-2016-07-12 16:06:33" - ) - lapply(results, function(res) { - expect_equal(res$get.project.conf()$get.value("ranges"), expected, - info = "Time ranges (number.windows).") - }) - - ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. 
- expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), - info = "Splitting must not modify the original ProjectConf.") - - ## test that the config contains the correct splitting information - expected.config = list( - split.type = "activity-based", - split.length = 4, - split.basis = "commits", - split.sliding.window = FALSE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:20", "2016-07-12 16:06:33"), - split.revision.dates = NULL - ) - lapply(results, function(res) { - actual = lapply(names(expected.config), res$get.project.conf()$get.value) - names(actual) = names(expected.config) - expect_equal(expected.config, actual) - }) - - ## check data for all ranges - expected.data = list( - commits = list( - "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$commits[1:4, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commits[5:8, ] - ), - commit.messages = list( - "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$commit.messages, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages - ), - issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:41, 45:49), ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ] - ), - mails = list( - "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$mails[rownames(data$mails) %in% 16:17, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ] - ), - pasta = list( - "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$pasta, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta - ), - synchronicity = list( - "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$synchronicity, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity - ) - ) - results.data = list( - commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), - commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), - issues = lapply(results, function(cf.data) cf.data$get.issues()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) - ) - expect_equal(results.data, expected.data, info = "Data for ranges (number.windows).") - - ## too large number of windows - - expect_error( - split.data.activity.based(project.data, activity.type = "commits", number.windows = nrow(project.data$get.commits.unfiltered()) + 10), - info = "Error expected (number.windows) (1)." - ) - - expect_error( - split.data.activity.based(project.data, activity.type = "commits", number.windows = 0), - info = "Error expected (number.windows) (2)." 
- ) - -}) - - -## -## Tests for split.data.activity.based(..., activity.type = 'mails') -## - -test_that("Split a data object activity-based (activity.type = 'mails').", { - - ## configuration objects - proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("issues.only.comments", FALSE) - net.conf = NetworkConf$new() - - ## data object - project.data = ProjectData$new(proj.conf) - data = list( - commits = project.data$get.commits.unfiltered(), - commit.messages = project.data$get.commit.messages(), - issues = project.data$get.issues(), - mails = project.data$get.mails(), - pasta = project.data$get.pasta(), - synchronicity = project.data$get.synchronicity() - ) - - ## split data - results = split.data.activity.based(project.data, activity.amount = 3, - activity.type = "mails", sliding.window = FALSE) - - ## check time ranges - expected = c( - "2004-10-09 18:38:13-2010-07-12 11:05:35", - "2010-07-12 11:05:35-2010-07-12 12:05:41", - "2010-07-12 12:05:41-2010-07-12 12:05:44", - "2010-07-12 12:05:44-2016-07-12 15:58:40", - "2016-07-12 15:58:40-2016-07-12 16:05:37", - "2016-07-12 16:05:37-2016-07-12 16:05:38" - ) - lapply(results, function(res) { - expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") - }) - - ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. - expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), - info = "Splitting must not modify the original ProjectConf.") - - ## test that the config contains the correct splitting information - expected.config = list( - split.type = "activity-based", - split.length = 3, - split.basis = "mails", - split.sliding.window = FALSE, - split.revisions = c("2004-10-09 18:38:13", "2010-07-12 11:05:35", "2010-07-12 12:05:41", - "2010-07-12 12:05:44" ,"2016-07-12 15:58:40", "2016-07-12 16:05:37", - "2016-07-12 16:05:38"), - split.revision.dates = NULL - ) - lapply(results, function(res) { - actual = lapply(names(expected.config), res$get.project.conf()$get.value) - names(actual) = names(expected.config) - expect_equal(expected.config, actual) - }) - - ## check data for all ranges - expected.data = list( - commits = list( - "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$commits[0, ], - "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$commits[0, ], - "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$commits[0, ], - "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$commits[0, ], - "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$commits[1:2, ], - "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$commits[0, ] - ), - commit.messages = list( - "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$commit.messages, - "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$commit.messages, - "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$commit.messages, - "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$commit.messages, - "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$commit.messages, - "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$commit.messages - ), - issues = list( - "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$issues[0, ], - "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$issues[0, ], - "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$issues[0, ], - "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$issues[rownames(data$issues) %in% c(1:13, 27:28, 43:44), ], - "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:40, 45:49), ], - "2016-07-12 16:05:37-2016-07-12 
16:05:38" = data$issues[0, ] - ), - mails = list( - "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$mails[rownames(data$mails) %in% 1:3, ], - "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$mails[rownames(data$mails) %in% 4:6, ], - "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$mails[rownames(data$mails) %in% 7:9, ], - "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$mails[rownames(data$mails) %in% 10:12, ], - "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$mails[rownames(data$mails) %in% 14:16, ], - "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 17, ] - ), - pasta = list( - "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$pasta, - "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$pasta, - "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$pasta, - "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$pasta, - "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$pasta, - "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$pasta - ), - synchronicity = list( - "2004-10-09 18:38:13-2010-07-12 11:05:35" = data$synchronicity, - "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$synchronicity, - "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$synchronicity, - "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$synchronicity, - "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$synchronicity, - "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$synchronicity - ) - ) - results.data = list( - commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), - commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), - issues = lapply(results, function(cf.data) cf.data$get.issues()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) - ) - expect_equal(results.data, expected.data, info = "Data for ranges.") - - ## - ## split by too-large activity amount - ## - - ## split data - results = split.data.activity.based(project.data, activity.amount = nrow(data$mails) + 10, - activity.type = "mails", sliding.window = FALSE) - - ## check time ranges - expected = c( - "2004-10-09 18:38:13-2016-07-12 16:05:38" - ) - lapply(results, function(res) { - expect_equal(res$get.project.conf()$get.value("ranges"), expected, - info = "Time ranges (too-large activity amount).") - }) - - ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. 
- expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), - info = "Splitting must not modify the original ProjectConf.") - - ## test that the config contains the correct splitting information - expected.config = list( - split.type = "activity-based", - split.length = 26, - split.basis = "mails", - split.sliding.window = FALSE, - split.revisions = c("2004-10-09 18:38:13", "2016-07-12 16:05:38"), - split.revision.dates = NULL - ) - lapply(results, function(res) { - actual = lapply(names(expected.config), res$get.project.conf()$get.value) - names(actual) = names(expected.config) - expect_equal(expected.config, actual) - }) - - ## check data for all ranges - expected.data = list( - commits = list( - "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$commits[1:2, ] - ), - commit.messages = list( - "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$commit.messages - ), - issues = list( - "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:15, 20:22, 27:29, 37:40, 43:45, 46:49), ] - ), - mails = list( - "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$mails - ), - pasta = list( - "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$pasta - ), - synchronicity = list( - "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$synchronicity - ) - ) - results.data = list( - commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), - commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), - issues = lapply(results, function(cf.data) cf.data$get.issues()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) - ) - expect_equal(results.data, expected.data, info = "Data for ranges (too-large activity amount).") - - ## - ## split by number of windows - ## - - ## split data - results = split.data.activity.based(project.data, number.windows = 2, - activity.type = "mails", sliding.window = FALSE) - - ## check time ranges - expected = c( - "2004-10-09 18:38:13-2010-07-12 12:05:43", - "2010-07-12 12:05:43-2016-07-12 16:05:38" - ) - lapply(results, function(res) { - expect_equal(res$get.project.conf()$get.value("ranges"), expected, - info = "Time ranges (number.windows).") - }) - - ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. 
- expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), - info = "Splitting must not modify the original ProjectConf.") - - ## test that the config contains the correct splitting information - expected.config = list( - split.type = "activity-based", - split.length = 8, - split.basis = "mails", - split.sliding.window = FALSE, - split.revisions = c("2004-10-09 18:38:13", "2010-07-12 12:05:43", "2016-07-12 16:05:38"), - split.revision.dates = NULL - ) - lapply(results, function(res) { - actual = lapply(names(expected.config), res$get.project.conf()$get.value) - names(actual) = names(expected.config) - expect_equal(expected.config, actual) - }) - - ## check data for all ranges - expected.data = list( - commits = list( - "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$commits[0, ], - "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$commits[1:2, ] - ), - commit.messages = list( - "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$commit.messages, - "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$commit.messages - ), - issues = list( - "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$issues[0, ], - "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:15, 20:22, 27:29, 37:40, 43:45, 46:49), ] - ), - mails = list( - "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$mails[rownames(data$mails) %in% 1:8, ], - "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$mails[rownames(data$mails) %in% 9:17, ] - ), - pasta = list( - "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$pasta, - "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$pasta - ), - synchronicity = list( - "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$synchronicity, - "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$synchronicity - ) - ) - results.data = list( - commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), - commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), - issues = lapply(results, function(cf.data) cf.data$get.issues()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) - ) - expect_equal(results.data, expected.data, info = "Data for ranges (number.windows).") - - ## too large number of windows - - expect_error( - split.data.activity.based(project.data, activity.type = "mails", number.windows = nrow(project.data$get.mails()) + 10), - info = "Error expected (number.windows) (1)." - ) - - expect_error( - split.data.activity.based(project.data, activity.type = "mails", number.windows = 0), - info = "Error expected (number.windows) (2)." 
- ) -}) - - -## -## Tests for split.data.activity.based(..., activity.type = 'issues') -## - -test_that("Split a data object activity-based (activity.type = 'issues').", { - - ## configuration objects - proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("issues.only.comments", FALSE) - net.conf = NetworkConf$new() - - ## data object - project.data = ProjectData$new(proj.conf) - data = list( - commits = project.data$get.commits.unfiltered(), - commit.messages = project.data$get.commit.messages(), - issues = project.data$get.issues(), - mails = project.data$get.mails(), - pasta = project.data$get.pasta(), - synchronicity = project.data$get.synchronicity() - ) - - ## split data - results = split.data.activity.based(project.data, activity.amount = 9, - activity.type = "issues", sliding.window = FALSE) - - ## check time ranges - expected = c( - "2013-04-21 23:52:09-2013-05-25 06:22:23", - "2013-05-25 06:22:23-2016-07-12 15:59:59", - "2016-07-12 15:59:59-2016-07-12 16:06:30", - "2016-07-12 16:06:30-2016-10-05 15:30:02", - "2016-10-05 15:30:02-2017-05-23 12:32:40" - ) - lapply(results, function(res) { - expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") - }) - - ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. - expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), - info = "Splitting must not modify the original ProjectConf.") - - ## test that the config contains the correct splitting information - expected.config = list( - split.type = "activity-based", - split.length = 9, - split.basis = "issues", - split.sliding.window = FALSE, - split.revisions = c("2013-04-21 23:52:09", "2013-05-25 06:22:23", "2016-07-12 15:59:59", - "2016-07-12 16:06:30", "2016-10-05 15:30:02", "2017-05-23 12:32:40"), - split.revision.dates = NULL - ) - lapply(results, function(res) { - actual = lapply(names(expected.config), res$get.project.conf()$get.value) - names(actual) = names(expected.config) - expect_equal(expected.config, actual) - }) - - ## check data for all ranges - expected.data = list( - commits = list( - "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$commits[0, ], - "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$commits[1, ], - "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$commits[2:5, ], - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$commits[6:8, ], - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$commits[0, ] - ), - commit.messages = list( - "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$commit.messages, - "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$commit.messages, - "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$commit.messages, - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$commit.messages, - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$commit.messages - ), - issues = list( - "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$issues[rownames(data$issues) %in% 1:10, ], - "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$issues[rownames(data$issues) %in% c(11:13, 20:21, 27:28, 43:44, 37:38), ], - "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$issues[rownames(data$issues) %in% c(14:15, 22, 29, 39:41, 45:49), ], - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$issues[rownames(data$issues) %in% c(16:19, 23:25, 30, 42), ], - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(26, 31:36), ] - ), - mails = list( - "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$mails[0, ], - "2013-05-25 
06:22:23-2016-07-12 15:59:59" = data$mails[rownames(data$mails) %in% 14:15, ], - "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$mails[rownames(data$mails) %in% 16:17, ], - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$mails[0, ], - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$mails[0, ] - ), - pasta = list( - "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$pasta, - "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$pasta, - "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$pasta, - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$pasta, - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$pasta - ), - synchronicity = list( - "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$synchronicity, - "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$synchronicity, - "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$synchronicity, - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$synchronicity, - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$synchronicity - ) - ) - results.data = list( - commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), - commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), - issues = lapply(results, function(cf.data) cf.data$get.issues()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) - ) - expect_equal(results.data, expected.data, info = "Data for ranges.") - - ## - ## split by too-large activity amount - ## - - ## split data - results = split.data.activity.based(project.data, activity.amount = nrow(data$issues) + 10, - activity.type = "issues", sliding.window = FALSE) - - ## check time ranges - expected = c( - "2013-04-21 23:52:09-2017-05-23 12:32:40" - ) - lapply(results, function(res) { - expect_equal(res$get.project.conf()$get.value("ranges"), expected, - info = "Time ranges (too-large activity amount).") - }) - - ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. 
- expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), - info = "Splitting must not modify the original ProjectConf.") - - ## test that the config contains the correct splitting information - expected.config = list( - split.type = "activity-based", - split.length = 59, - split.basis = "issues", - split.sliding.window = FALSE, - split.revisions = c("2013-04-21 23:52:09", "2017-05-23 12:32:40"), - split.revision.dates = NULL - ) - lapply(results, function(res) { - actual = lapply(names(expected.config), res$get.project.conf()$get.value) - names(actual) = names(expected.config) - expect_equal(expected.config, actual) - }) - - ## check data for all ranges - expected.data = list( - commits = list( - "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$commits - ), - commit.messages = list( - "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$commit.messages - ), - issues = list( - "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$issues - ), - mails = list( - "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$mails[rownames(data$mails) %in% 14:17, ] - ), - pasta = list( - "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$pasta - ), - synchronicity = list( - "2013-04-21 23:52:09-2017-05-23 12:32:40" = data$synchronicity - ) - ) - results.data = list( - commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), - commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), - issues = lapply(results, function(cf.data) cf.data$get.issues()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) - ) - expect_equal(results.data, expected.data, info = "Data for ranges (too-large activity amount).") - - ## - ## split by number of windows - ## - - ## split data - results = split.data.activity.based(project.data, number.windows = 2, - activity.type = "issues", sliding.window = FALSE) - - ## check time ranges - expected = c( - "2013-04-21 23:52:09-2016-07-12 16:02:02", - "2016-07-12 16:02:02-2017-05-23 12:32:40" - ) - lapply(results, function(res) { - expect_equal(res$get.project.conf()$get.value("ranges"), expected, - info = "Time ranges (number.windows).") - }) - - ## This value should not change, so we compare it with the default, which is `c("v1-v2", "v2-v3")`. 
- expect_equal(proj.conf$get.value("ranges"), c("v1-v2", "v2-v3"), - info = "Splitting must not modify the original ProjectConf.") - - ## test that the config contains the correct splitting information - expected.config = list( - split.type = "activity-based", - split.length = 21, - split.basis = "issues", - split.sliding.window = FALSE, - split.revisions = c("2013-04-21 23:52:09", "2016-07-12 16:02:02", "2017-05-23 12:32:40"), - split.revision.dates = NULL - ) - lapply(results, function(res) { - actual = lapply(names(expected.config), res$get.project.conf()$get.value) - names(actual) = names(expected.config) - expect_equal(expected.config, actual) - }) - - ## check data for all ranges - expected.data = list( - commits = list( - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$commits[1:2, ], - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$commits[3:8, ] - ), - commit.messages = list( - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$commit.messages, - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$commit.messages - ), - issues = list( - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$issues[rownames(data$issues) %in% c(1:14, 20:22, 27:28, 37:40, 43:44), ], - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(15:19, 23:26, 29:36, 41:42, 45:49), ] - ), - mails = list( - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$mails[rownames(data$mails) %in% 14:15, ], - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$mails[rownames(data$mails) %in% 16:17, ] - ), - pasta = list( - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$pasta, - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$pasta - ), - synchronicity = list( - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$synchronicity, - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$synchronicity - ) - ) - results.data = list( - commits = lapply(results, function(cf.data) cf.data$get.commits.unfiltered()), - commit.messages = lapply(results, function(cf.data) cf.data$get.commit.messages()), - issues = lapply(results, function(cf.data) cf.data$get.issues()), - mails = lapply(results, function(cf.data) cf.data$get.mails()), - pasta = lapply(results, function(cf.data) cf.data$get.pasta()), - synchronicity = lapply(results, function(cf.data) cf.data$get.synchronicity()) - ) - expect_equal(results.data, expected.data, info = "Data for ranges (number.windows).") - - ## too large number of windows - - expect_error( - split.data.activity.based(project.data, activity.type = "issues", number.windows = nrow(project.data$get.issues()) + 10), - info = "Error expected (number.windows) (1)." - ) - - expect_error( - split.data.activity.based(project.data, activity.type = "issues", number.windows = 0), - info = "Error expected (number.windows) (2)." - ) -}) - - -## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / -## Split network ----------------------------------------------------------- - -## * time-based ------------------------------------------------------------ - -## * * time period --------------------------------------------------------- - -## -## Tests for split.network.time.based(..., time.period = ...) 
-## - -test_that("Split a network time-based (time.period = ...).", { - - ## time period - time.period = "2 mins" - - ## configuration and data objects - proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("commits.filter.base.artifact", FALSE) - net.conf = NetworkConf$new() - net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) - project.data = ProjectData$new(proj.conf) - net.builder = NetworkBuilder$new(project.data, net.conf) - - ## - ## simplify = FALSE - ## - - ## retrieve author network - author.net = net.builder$get.author.network() - - expected = list( - "2016-07-12 15:58:59-2016-07-12 16:00:59" = igraph::subgraph.edges(author.net, c(1:2)), - "2016-07-12 16:00:59-2016-07-12 16:02:59" = igraph::subgraph.edges(author.net, c()), - "2016-07-12 16:02:59-2016-07-12 16:04:59" = igraph::subgraph.edges(author.net, c()), - "2016-07-12 16:04:59-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(3:8)) - ) - results = split.network.time.based(author.net, time.period = "2 mins") - - ## check ranges (labels) - expect_equal(names(results), names(expected), info = "Time ranges.") - - ## check networks - check.identical = mapply(results, expected, FUN = function(r, e) { - igraph::identical_graphs(r, e) - }) - expect_true(all(check.identical), info = "Network equality.") - - ## - ## simplify = TRUE - ## - - ## update network configuration - net.builder$update.network.conf(list(author.relation = "cochange", simplify = TRUE)) - net.builder$reset.environment() - - ## retrieve author network - author.net = net.builder$get.author.network() - - expect_error(split.network.time.based(author.net, time.period = time.period), info = "Illegal split.") - -}) - -## -## Tests for split.networks.time.based(..., time.period = ...) -## - -patrick::with_parameters_test_that("Split a list of networks time-based, ", { - - ## time period - time.period = "2 years" - - ## configuration and data objects - proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("commits.filter.base.artifact", FALSE) - net.conf = NetworkConf$new() - net.conf$update.values(list(simplify = FALSE, author.directed = TRUE)) - project.data = ProjectData$new(proj.conf) - net.builder = NetworkBuilder$new(project.data, net.conf) - - ## obtain networks: - ## 1) co-change network - net.builder$update.network.conf(list(author.relation = "cochange")) - net.cochange = net.builder$get.author.network() - ## 2) mail network - net.builder$update.network.conf(list(author.relation = "mail")) - net.mail = net.builder$get.author.network() - - ## split networks - net.split = split.networks.time.based( - networks = list(net.cochange, net.mail), - time.period = time.period, - sliding.window = test.sliding.window - ) - - ## check whether the splitting information of the two split networks is identical - expect_identical(attributes(net.split[[1]]), attributes(net.split[[2]]), info = "Splitting information.") - - ## check whether this also works with one network in the list (if not, an error will occur) - net.split = split.networks.time.based( - networks = list(net.mail), - time.period = time.period, - sliding.window = test.sliding.window - ) - -}, patrick::cases( - "sliding window: FALSE" = list(test.sliding.window = FALSE), - "sliding window: TRUE" = list(test.sliding.window = TRUE) -)) - -## * * bins ---------------------------------------------------------------- - -## -## Tests for split.network.time.based(..., bins = ...) 
-## - -patrick::with_parameters_test_that("Split a network time-based (bins = ...), ", { - - ## bins - bins = c("2016-07-12 15:58:00", "2016-07-12 16:00:59", "2016-07-12 16:02:59", - "2016-07-12 16:04:59", "2016-07-12 17:21:43") - - ## configuration and data objects - proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("commits.filter.base.artifact", FALSE) - net.conf = NetworkConf$new() - net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) - project.data = ProjectData$new(proj.conf) - net.builder = NetworkBuilder$new(project.data, net.conf) - - ## - ## simplify = FALSE - ## - - ## retrieve author network - author.net = net.builder$get.author.network() - - ## results - expected = list( - "2016-07-12 15:58:00-2016-07-12 16:00:59" = igraph::subgraph.edges(author.net, c(1:2)), - "2016-07-12 16:00:59-2016-07-12 16:02:59" = igraph::subgraph.edges(author.net, c()), - "2016-07-12 16:02:59-2016-07-12 16:04:59" = igraph::subgraph.edges(author.net, c()), - "2016-07-12 16:04:59-2016-07-12 17:21:43" = igraph::subgraph.edges(author.net, c(3:8)) - ) - results = split.network.time.based(author.net, bins = bins, sliding.window = test.sliding.window) - - ## check ranges (labels) - expect_equal(names(results), names(expected), info = "Time ranges.") - - ## check networks - check.identical = mapply(results, expected, FUN = function(r, e) { - igraph::identical_graphs(r, e) - }) - expect_true(all(check.identical), info = "Network equality.") - - ## - ## simplify = TRUE - ## - - ## update network configuration - net.conf$update.values(list(author.relation = "cochange", simplify = TRUE)) - net.builder$reset.environment() - - ## retrieve author network - author.net = net.builder$get.author.network() - - expect_error(split.network.time.based(author.net, bins = bins, sliding.window = test.sliding.window), - info = "Illegal split.") - -}, patrick::cases( - "sliding window (ignored): FALSE" = list(test.sliding.window = FALSE), - "sliding window (ignored): TRUE" = list(test.sliding.window = TRUE) -)) - -## * * ranges -------------------------------------------------------------------- - -## -## Test splitting network by ranges. 
-## - -test_that("Test splitting network by ranges", { - - - ## bins - bins = c("2016-07-12 15:58:00", "2016-07-12 16:00:59", "2016-07-12 16:02:59", - "2016-07-12 16:04:59", "2016-07-12 17:21:43") - ranges = construct.ranges(bins, sliding.window = FALSE, raw = TRUE) - - ## configuration and data objects - proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("commits.filter.base.artifact", FALSE) - net.conf = NetworkConf$new() - net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) - project.data = ProjectData$new(proj.conf) - net.builder = NetworkBuilder$new(project.data, net.conf) - - ## retrieve author network - author.net = net.builder$get.author.network() - expected.results = split.network.time.based(author.net, bins = bins) - results = split.network.time.based.by.ranges(author.net, ranges) - - ## check time ranges - expect_equal(names(results), names(ranges), info = "Time ranges.") - - ## check data for all ranges - check.identical = mapply(results, expected.results, FUN = function(r, e) { - return(igraph::identical_graphs(r, e)) - }) - expect_true(all(check.identical), info = "Network equality (split by ranges).") -}) - -## * activity-based ------------------------------------------------------------ - -## -## Tests for split.network.activity.based(...) -## - -test_that("Split a network activity-based (number.edges, number.windows).", { - - ## configuration and data objects - proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("commits.filter.base.artifact", FALSE) - net.conf = NetworkConf$new() - net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) - project.data = ProjectData$new(proj.conf) - net.builder = NetworkBuilder$new(project.data, net.conf) - - ## retrieve author network - author.net = net.builder$get.author.network() - - ## - ## number.edges (1) - ## - - ## results - expected = list( - "2016-07-12 15:58:59-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(1, 2)), - "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(3, 5)), - "2016-07-12 16:06:10-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(4, 7)), - "2016-07-12 16:06:32-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(6, 8)) - ) - results = split.network.activity.based(author.net, number.edges = 2) - - ## check ranges (labels) - expect_equal(names(results), names(expected), info = "Time ranges (number.edges (1)).") - - ## check networks - check.identical = mapply(results, expected, FUN = function(r, e) { - igraph::identical_graphs(r, e) - }) - expect_true(all(check.identical), info = "Network equality (number.edges (1)).") - - ## - ## number.edges (2) - ## - - ## results - expected = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(1:igraph::ecount(author.net))) - ) - results = split.network.activity.based(author.net, number.edges = igraph::ecount(author.net) + 10) - - ## check ranges (labels) - expect_equal(names(results), names(expected), info = "Time ranges (number.edges (2)).") - - ## check networks - check.identical = mapply(results, expected, FUN = function(r, e) { - igraph::identical_graphs(r, e) - }) - expect_true(all(check.identical), info = "Network equality (number.edges (2)).") - - ## - ## number.windows (1) - ## - - ## results - expected = list( - "2016-07-12 15:58:59-2016-07-12 16:05:41" = igraph::subgraph.edges(author.net, c(1, 2, 3)), - "2016-07-12 
16:05:41-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(4, 5, 7)), - "2016-07-12 16:06:32-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(6, 8)) - ) - results = split.network.activity.based(author.net, number.windows = 3) - - ## check ranges (labels) - expect_equal(names(results), names(expected), info = "Time ranges (number.windows (1)).") - - ## check networks - check.identical = mapply(results, expected, FUN = function(r, e) { - igraph::identical_graphs(r, e) - }) - expect_true(all(check.identical), info = "Network equality (number.windows (1)).") - - ## - ## number.windows (2) - ## - - expect_error( - split.network.activity.based(author.net, number.windows = igraph::ecount(author.net) + 10), - info = "Error expected (number.windows (2))." - ) - -}) - - -## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / -## Split raw data (data and networks by bins) ------------------------------ - -## -## Tests for split.data.by.bins and split.network.by.bins -## - -test_that("Split network and data on low level (split.data.by.bins, split.network.by.bins).", { - - length.dates = 15 - length.bins = 5 - - ## generate dates - dates = c("2000-01-25", "2000-01-23", "2000-01-15", "2000-01-27", "2000-01-13", - "2000-01-03", "2000-01-05", "2000-01-29", "2000-01-19", "2000-01-01", - "2000-01-11", "2000-01-07", "2000-01-21", "2000-01-09", "2000-01-17") - # ## ## generated with: - # sprintf("c(\"%s\")", paste( - # get.date.string(sample( - # seq.POSIXt(get.date.from.string("2000-01-01"), get.date.from.string("2000-02-01"), by = "1 days"), - # length.dates, - # replace = FALSE - # )), collapse = "\", \"")) - - ## generate bins - bins = seq_len(length.bins) - bins.vector = c("1", "3", "5", "4", "1", "3", "1", "3", "2", "5", "4", "2", "4", "3", "5") - ## ## generated with: - ## sprintf("c(\"%s\")", paste( sample(bins, size = length.dates, replace = TRUE), collapse = "', '") ) - - ## - ## split.data.by.bins - ## - - ## generate data frame with dates and IDs - df = data.frame( - id = 1:length.dates, - date = dates - ) - - ## results - expected = list( - "1" = df[ c(1, 5, 7), ], - "2" = df[ c(9, 12), ], - "3" = df[ c(2, 6, 8, 14), ], - "4" = df[ c(4, 11, 13), ], - "5" = df[ c(3, 10, 15), ] - ) - results = split.data.by.bins(df, bins.vector) - - ## check result - expect_equal(results, expected, info = "Split data by bins.") - - ## - ## split.network.by.bins - ## - - ## generate data frame with dates and IDs - vcount = 4 - net = igraph::make_empty_graph(n = vcount, directed = FALSE) - for (e.id in seq_len(length.dates)) { - net = net + igraph::edge( - sample(seq_len(vcount), 1), # from vertex - sample(seq_len(vcount), 1), # to vertex - date = get.date.from.string(dates[e.id]) - ) - } - - ## results - expected = list( - igraph::subgraph.edges(net, c(1, 5, 7)), - igraph::subgraph.edges(net, c(9, 12)), - igraph::subgraph.edges(net, c(2, 6, 8, 14)), - igraph::subgraph.edges(net, c(4, 11, 13)), - igraph::subgraph.edges(net, c(3, 10, 15)) - ) - results = split.network.by.bins(net, bins, bins.vector) - - ## check networks - check.identical = mapply(results, expected, FUN = function(r, e) { - igraph::identical_graphs(r, e) - }) - expect_true(all(check.identical), info = "Split network by bins (network equality).") - -}) - - -## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / -## Bin identification ------------------------------------------------------ - -## -## Tests for split.get.bins.time.based and split.get.bins.activity.based -## - -test_that("Get 
bins for network and data on low level (split.get.bins.time.based, split.get.bins.activity.based).", { - - length.dates = 15 - length.bins = 5 - - ## generate dates - dates = c("2000-01-25", "2000-01-23", "2000-01-15", "2000-01-27", "2000-01-13", - "2000-01-03", "2000-01-05", "2000-01-29", "2000-01-19", "2000-01-01", - "2000-01-11", "2000-01-07", "2000-01-21", "2000-01-09", "2000-01-17") - dates.posixct = get.date.from.string(dates) - ## ## generated with: - ## sprintf("c(\"%s\")", paste( - ## get.date.string(sample( - ## seq.POSIXt(get.date.from.string("2000-01-01"), get.date.from.string("2000-02-01"), by = "1 days"), - ## length.dates, - ## replace = FALSE - ## )), collapse = "\", \"")) - - ## - ## split.get.bins.time.based (1) - ## - - ## results - expected.bins = c("2000-01-01 00:00:00", "2000-01-11 00:00:00", "2000-01-21 00:00:00", "2000-01-29 00:00:01") - expected = list( - vector = factor(head(expected.bins, -1))[c(3, 3, 2, 3, 2, - 1, 1, 3, 2, 1, - 2, 1, 3, 1, 2)], - bins = expected.bins - ) - results = split.get.bins.time.based(dates.posixct, "10 days") - - ## check result - expect_equal(results, expected, info = "split.get.bins.time.based (1)") - - ## - ## split.get.bins.time.based (2) - ## - - ## results - expected.bins = c("2000-01-01 00:00:00", "2000-01-29 00:00:01") - expected = list( - vector = factor(head(expected.bins, -1))[ rep(1, length.dates) ], - bins = expected.bins - ) - results = split.get.bins.time.based(dates.posixct, "1 year") - - ## check result - expect_equal(results, expected, info = "split.get.bins.time.based (2)") - - ## - ## split.get.bins.time.based (3) - ## - - ## results - dates.unround = get.date.from.string(c("2004-01-01 00:00:00", "2004-01-01 00:00:14", "2004-01-01 00:00:22")) - expected.bins = c("2004-01-01 00:00:00", "2004-01-01 00:00:05", "2004-01-01 00:00:10", - "2004-01-01 00:00:15", "2004-01-01 00:00:20", "2004-01-01 00:00:23") # adding 4.2 seconds each - expected = list( - vector = factor(head(expected.bins, -1))[ c(1, 3, 5) ], - bins = expected.bins - ) - results = split.get.bins.time.based(dates.unround, number.windows = length.bins) - - ## check result - expect_equal(results, expected, info = "split.get.bins.time.based (3)") - - ## - ## split.get.bins.activity.based (1) - ## - - ## construct data.frame - df = data.frame(date = dates.posixct, id = seq_len(length.dates)) - df = df[ order(df$date), ] - - ## results - expected = list( - vector = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4), - bins = c("2000-01-01 00:00:00", "2000-01-09 00:00:00", "2000-01-17 00:00:00", "2000-01-25 00:00:00", "2000-01-29 00:00:01") - ) - results = split.get.bins.activity.based(df, "id", 4) - - ## check result - expect_equal(results, expected, info = "split.get.bins.activity.based (1)") - - ## - ## split.get.bins.activity.based (2) - ## - - ## construct data.frame - df = data.frame(date = dates.posixct, id = seq_len(length.dates)) - df = df[ order(df$date), ] - - ## results - expected = list( - vector = rep(1, length.out = length.dates), - bins = c("2000-01-01 00:00:00", "2000-01-29 00:00:01") - ) - results = split.get.bins.activity.based(df, "id", nrow(df) + 10) - - ## check result - expect_equal(results, expected, info = "split.get.bins.activity.based (2)") - -}) - - -## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / -## Consistency tests ------------------------------------------------------- - -## -## Tests for consistency of data and network time-based splitting -## - -test_that("Check consistency of data and network time-based 
splitting.", { - - ## configuration and data objects - proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("commits.filter.base.artifact", FALSE) - net.conf = NetworkConf$new() - net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) - - ## retrieve project data and network builder - project.data = ProjectData$new(proj.conf) - net.builder = NetworkBuilder$new(project.data, net.conf) - ## retrieve author network - project.net = net.builder$get.author.network() - - ## set time period for splitting - time.period = "7 mins" - - ## split data - results.data = split.data.time.based(project.data, time.period = time.period, split.basis = "commits") - results.data.network = lapply(results.data, function(d) NetworkBuilder$new(d, net.conf)$get.author.network()) - - ## split network - results.network = split.network.time.based(project.net, time.period = time.period) - - ## check ranges - expect_equal(names(results.network), names(results.data.network), info = "Range equality.") - - ## the chosen time-window size results in the following condition: - ## 1) Thomas and Karl only appear in the second time window, both working on the base feature. - ## 2) Olaf only appears in the first time window, working on the base feature as the only author. - ## Thus, when splitting the project-level network, there are edges from Olaf to Karl and Thomas, - ## crossing the time-window border. Hence, when deleting the respective vertices from the networks, - ## the data-based networks should match the network-based networks. - results.network[[1]] = igraph::delete.vertices(results.network[[1]], c("Thomas", "Karl")) - results.network[[2]] = igraph::delete.vertices(results.network[[2]], c("Olaf")) - check.identical = mapply(results.data.network, results.network, FUN = function(d, n) { - igraph::identical_graphs(d, n) - }) - expect_true(all(check.identical), info = "Network equality.") - -}) - - -## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / -## Unification of range names ---------------------------------------------- - -## -## Tests for duplicate range names -## - -test_that("Check and correct duplicate range names during network activity-based splitting.", { - - ## define dates for edges and the resulting changes - dates = get.date.from.string(c( - "2000-01-01 01:00:00", "2001-01-01 12:00:00", - - "2001-01-01 12:00:00", "2001-01-01 12:00:00", - "2001-01-01 12:00:00", "2001-01-01 12:00:00", - "2001-01-01 12:00:00", "2001-01-01 12:00:00", - - "2002-01-01 12:00:00", "2002-01-01 12:00:00", - "2002-01-01 12:00:00", "2002-01-01 12:00:00", - "2002-01-01 12:00:00", "2002-01-01 12:00:00", - "2002-01-01 12:00:00", "2002-01-01 12:00:00", - - "2002-01-01 12:00:00", "2003-01-01 12:00:00" - )) - expected.ranges = c( - "2000-01-01 01:00:00-2001-01-01 12:00:00", - - "2001-01-01 12:00:00-2001-01-01 12:00:00", - "2001-01-01 12:00:00-2001-01-01 12:00:00", - - "2001-01-01 12:00:00-2002-01-01 12:00:00", - - "2002-01-01 12:00:00-2002-01-01 12:00:00", - "2002-01-01 12:00:00-2002-01-01 12:00:00", - "2002-01-01 12:00:00-2002-01-01 12:00:00", - "2002-01-01 12:00:00-2002-01-01 12:00:00", - - "2002-01-01 12:00:00-2003-01-01 12:00:01" - ) - expected.ranges.corrected = c( - "2000-01-01 01:00:00-2001-01-01 12:00:00", - - "2001-01-01 12:00:00-2001-01-01 12:00:00 (1)", - "2001-01-01 12:00:00-2001-01-01 12:00:00 (2)", - - "2001-01-01 12:00:00-2002-01-01 12:00:00", - - "2002-01-01 12:00:00-2002-01-01 12:00:00 (1)", - "2002-01-01 12:00:00-2002-01-01 
12:00:00 (2)", - "2002-01-01 12:00:00-2002-01-01 12:00:00 (3)", - "2002-01-01 12:00:00-2002-01-01 12:00:00 (4)", - - "2002-01-01 12:00:00-2003-01-01 12:00:01" - ) - - ## construct a small network - net = igraph::make_empty_graph(directed = FALSE) + - igraph::vertices(c("A", "B")) + - igraph::edges(rep(c("A", "B"), times = length(dates))) - ## set some date attributes that are appropriate for the test case - net = igraph::set.edge.attribute(net, "date", value = dates) - - ## define split arguments - split.function = split.network.activity.based - split.activity.amount = 2 - split.arguments = list(network = net, number.edges = split.activity.amount, sliding.window = FALSE) - - ## check for issued warning - expect_output( - do.call(split.function, split.arguments), - "WARNING::Due to the splitting, there are duplicated range names.", - fixed = TRUE, - info = "Generate warning." - ) - - ## check range names - net.split = do.call(split.function, split.arguments) - ranges = names(net.split) - expect_equal(ranges, expected.ranges, info = "Ranges (original).") - - ## correct ranges - ranges.corrected = split.unify.range.names(ranges) - expect_equal(ranges.corrected, expected.ranges.corrected, info = "Ranges (unified).") - - - ## Arbitrary range names (1) - ranges = c("A-B", "B-C", "C-D") - expected = c("A-B", "B-C", "C-D") - result = split.unify.range.names(ranges) - expect_identical(result, expected, info = "Arbitrary ranges (1).") - - ## Arbitrary range names (2) - ranges = c("A-B", "A-B", "B-C", "B-C", "C-D") - expected = c("A-B (1)", "A-B (2)", "B-C (1)", "B-C (2)", "C-D") - result = split.unify.range.names(ranges) - expect_identical(result, expected, info = "Arbitrary ranges (2).") - - ## Arbitrary range names (3) - ranges = c("A-B", "A-B", "B-C", "A-B", "B-C") - expected = c("A-B (1)", "A-B (2)", "B-C (1)", "A-B (1)", "B-C (1)") - result = split.unify.range.names(ranges) - expect_identical(result, expected, info = "Arbitrary ranges (3).") - - ## Arbitrary range names (4) - ranges = c("A-B", "A-B", "B-C", "C-D", "C-D") - expected = c("A-B (1)", "A-B (2)", "B-C", "C-D (1)", "C-D (2)") - result = split.unify.range.names(ranges) - expect_identical(result, expected, info = "Arbitrary ranges (4).") - - ## - ## the removal duplicate ranges - ## - - df = data.frame(date = dates, id = 1:length(dates)) - expected = expected.ranges[c(1, 4, 9)] - result = construct.ranges( - split.get.bins.activity.based(df, "id", activity.amount = split.activity.amount, remove.duplicate.bins = TRUE)[["bins"]], - sliding.window = FALSE - ) - expect_identical(result, expected, info = "Removal of duplicate ranges.") - -}) diff --git a/tests/testing-utils.R b/tests/testing-utils.R new file mode 100644 index 00000000..71ac36f7 --- /dev/null +++ b/tests/testing-utils.R @@ -0,0 +1,135 @@ +## This file is part of coronet, which is free software: you +## can redistribute it and/or modify it under the terms of the GNU General +## Public License as published by the Free Software Foundation, version 2. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+## +## Copyright 2022 by Jonathan Baumann +## All Rights Reserved. + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Libraries --------------------------------------------------------------- + +requireNamespace("patrick") +requireNamespace("igraph") + +#' Construct the 'cross product' of two patrick::cases objects. +#' Each case of the first object is combined with each case of the second; +#' test names are joined with a comma. +#' The variable names in the two cases objects are assumed to be disjoint. +#' +#' Calls to this function can be nested to generate cross products of more cases. +#' +#' Example: +#' +#' joined.cases = cases.cross.product(patrick::cases( +#' "sliding window: FALSE" = list(test.sliding.window = FALSE), +#' "sliding window: TRUE" = list(test.sliding.window = TRUE) +#' ), patrick::cases( +#' "PaStA: FALSE" = list(test.pasta = FALSE), +#' "PaStA: TRUE" = list(test.pasta = TRUE) +#' )) +#' +#' yields the following tibble: +#' +#' .test_name test.sliding.window test.pasta +#' 1 sliding window: FALSE, PaStA: FALSE FALSE FALSE +#' 2 sliding window: FALSE, PaStA: TRUE FALSE TRUE +#' 3 sliding window: TRUE, PaStA: FALSE TRUE FALSE +#' 4 sliding window: TRUE, PaStA: TRUE TRUE TRUE +#' +#' This tibble can then be used like this: +#' patrick::with_parameters_test_that("...", {...}, .cases = joined.cases) +#' +#' @param cases.1 a patrick::cases object (or compatible tibble). +#' @param cases.2 a patrick::cases object (or compatible tibble). +#' +#' @returns the cross product of the two patrick::cases objects, as a tibble. +cases.cross.product = function (cases.1, cases.2) { + ## patrick::cases are tibbles with a column named '.test_name' and columns for each variable. + + ## Creating an empty tibble with matching types is not trivial. + ## Therefore, the tibble is created with the first row of data. + result = NULL + is.first = TRUE + + for (id.1 in seq_len(nrow(cases.1))) { + row.1 = cases.1[id.1, ] + for (id.2 in seq_len(nrow(cases.2))) { + row.2 = cases.2[id.2, ] + + test.name.1 = row.1[[".test_name"]] + test.name.2 = row.2[[".test_name"]] + ## The new test name consists of both previous names, joined with a comma. + test.name.combined = paste(test.name.1, test.name.2, sep = ", ") + + ## Select everything from 'row.1' and everything but '.test_name' from 'row.2'. + row.combined = cbind(row.1, subset(row.2, select = -.test_name)) + ## Set the new combined test name. + row.combined[[".test_name"]] = test.name.combined + + ## If this is the first row, we need to create the tibble. + ## Otherwise, we add a row. + if (is.first) { + result = row.combined + is.first = FALSE + } else { + result = tibble::add_case(result, row.combined) + } + } + } + + return(result) +} + +#' Remove the row names of each data frame in a list of data frames. +#' Useful for comparing data where row names are allowed to change. +#' +#' Note that removing row names causes them to be set to increasing numbers starting at one, +#' so this function may have to be called on both lists involved in a comparison. +#' +#' @param data a list of data frames +#' +#' @return the list of data frames, but without row names +remove.row.names.from.data = function(data) { + return( + lapply(data, function (df) { + row.names(df) = NULL + return(df) + }) + ) +} + +#' Remove the row names of all data frames in a list of lists of data frames, for example a list of range data objects.
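+#'
+#' Example (a hedged sketch; 'data.expected' and 'data.actual' are hypothetical lists of range data
+#' objects whose row names are allowed to differ between the two sides of the comparison):
+#' expect_equal(remove.row.names.from.inner.list.of.dfs(data.expected),
+#'              remove.row.names.from.inner.list.of.dfs(data.actual))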
+#' @seealso remove.row.names.from.data +#' +#' @param list.of.lists.of.dfs a list of lists of dataframes +#' +#' @return the list of lists of dataframes, but without row names for the data frames +remove.row.names.from.inner.list.of.dfs = function(list.of.lists.of.dfs) { + return(lapply(list.of.lists.of.dfs, remove.row.names.from.data)) +} + +#' Compare edges and vertices of two networks +#' +#' @param network.expected the expected network +#' @param network.actual the actual network +compare.networks = function(network.expected, network.actual) { + ## TODO as soon as the bug in igraph is fixed switch to the expect_true function below + # expect_true(igraph::identical_graphs(network.expected, network.actual)) + expected.edges = igraph::as_data_frame(network.expected, what = "edges") + expected.vertices = igraph::as_data_frame(network.expected, what = "vertices") + + actual.edges = igraph::as_data_frame(network.actual, what = "edges") + actual.vertices = igraph::as_data_frame(network.actual, what = "vertices") + + expect_identical(expected.edges, actual.edges, info = "network edges") + expect_identical(expected.vertices, actual.vertices, info = "network vertices") +} diff --git a/util-conf.R b/util-conf.R index bb8b518c..a34e24e8 100644 --- a/util-conf.R +++ b/util-conf.R @@ -25,6 +25,7 @@ ## Copyright 2020-2021 by Niklas Schneider ## Copyright 2021 by Johannes Hostert ## Copyright 2021 by Mirabdulla Yusifli +## Copyright 2022 by Jonathan Baumann ## All Rights Reserved. @@ -119,7 +120,9 @@ Conf = R6::R6Class("Conf", existing = TRUE, updatable = TRUE, type = class(value) %in% attribute[["type"]], - allowed = + ## if 'allowed' is not defined for this attribute, any + ## value of the correct type should be accepted. + allowed = is.null(attribute[["allowed"]]) || if (attribute[["type"]] == "numeric" && length(attribute[["allowed"]]) == 1) { value <= attribute[["allowed"]] } else { @@ -449,6 +452,17 @@ ProjectConf = R6::R6Class("ProjectConf", inherit = Conf, type = "logical", allowed = c(TRUE, FALSE), allowed.number = 1 + ), + custom.event.timestamps.file = list( + default = NA, + type = "character", + allowed.number = 1 + ), + custom.event.timestamps.locked = list( + default = FALSE, + type = "logical", + allowed = c(TRUE, FALSE), + allowed.number = 1 ) ), @@ -580,7 +594,7 @@ ProjectConf = R6::R6Class("ProjectConf", inherit = Conf, ## construct file name for configuration conf.file = private$construct.conf.path(data, selection.process, casestudy, tagging) - ## load case-study confuration from given file + ## load case-study configuration from given file logging::loginfo("Attempting to load configuration file: %s", conf.file) conf = yaml::yaml.load_file(conf.file) @@ -604,7 +618,7 @@ ProjectConf = R6::R6Class("ProjectConf", inherit = Conf, conf$datapath.gender = private$get.results.folder(data, selection.process, casestudy, "gender") ## store path to issue data conf$datapath.issues = private$get.results.folder(data, selection.process, casestudy, tagging, subfolder = tagging) - + ## READ REVISIONS META-DATA ## read revisions file diff --git a/util-data-misc.R b/util-data-misc.R index a9cec28b..6d0d1803 100644 --- a/util-data-misc.R +++ b/util-data-misc.R @@ -19,6 +19,7 @@ ## Copyright 2019 by Jakob Kronawitter ## Copyright 2021 by Johannes Hostert ## Copyright 2021 by Christian Hechtl +## Copyright 2022 by Jonathan Baumann ## All Rights Reserved. 
## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / @@ -77,12 +78,14 @@ mask.pull.requests = function(issue.data) { #' @param type which issue type to consider. #' One of \code{"issues"}, \code{"pull.requests"} or \code{"all"} #' [default: "all"] +#' @param use.unfiltered.data whether to use the unfiltered issue data, i.e. \code{proj.data$get.issues.unfiltered()} +#' instead of \code{proj.data$get.issues()} [default: FALSE] #' #' @return a filtered sub-data frame of the unfiltered issue data from \code{proj.data}. preprocess.issue.data = function(proj.data, retained.cols = c("author.name", "issue.id", "event.name"), - type = c("all", "pull.requests", "issues")) { + type = c("all", "pull.requests", "issues"), use.unfiltered.data = FALSE) { type = match.arg(type) - df = proj.data$get.issues.unfiltered() + df = if (use.unfiltered.data) proj.data$get.issues.unfiltered() else proj.data$get.issues() ## forall vectors k, if nrow(df) == 0, then df[k, ..] fails ## so we abort beforehand @@ -359,13 +362,14 @@ get.author.mail.thread.count = function(proj.data) { #' @param type which issue type to consider (see \code{preprocess.issue.data}). #' One of \code{"issues"}, \code{"pull.requests"} or \code{"all"} #' [default: "all"] +#' @param use.unfiltered.data whether to use unfiltered issue data (see \code{preprocess.issue.data}) [default: FALSE] #' #' @return a dataframe consisting of two columns, the first of which holding the authors' names and the second holding #' their respective issue counts -get.author.issue.count = function(proj.data, type = c("all", "issues", "pull.requests")) { +get.author.issue.count = function(proj.data, type = c("all", "issues", "pull.requests"), use.unfiltered.data = FALSE) { type = match.arg(type) logging::logdebug("get.author.issue.count: starting.") - df = preprocess.issue.data(proj.data, type = type) + df = preprocess.issue.data(proj.data, type = type, use.unfiltered.data = use.unfiltered.data) ## count distinct since an author may appear in the same issue multiple times stmt = "SELECT `author.name`, COUNT( DISTINCT `issue.id`) as `freq` FROM `df` GROUP BY `author.name` ORDER BY `freq` DESC, `author.name` ASC" @@ -383,13 +387,17 @@ get.author.issue.count = function(proj.data, type = c("all", "issues", "pull.req #' @param type which issue type to consider (see \code{preprocess.issue.data}). #' One of \code{"issues"}, \code{"pull.requests"} or \code{"all"} #' [default: "all"] +#' @param use.unfiltered.data whether to use unfiltered issue data (see \code{preprocess.issue.data}). Note that the +#' filtered data may not contain issue created events. 
+#' [default: TRUE] #' #' @return a dataframe consisting of two columns, the first of which holding the authors' names and the second holding #' their respective issue counts -get.author.issues.created.count = function(proj.data, type = c("all", "issues", "pull.requests")) { +get.author.issues.created.count = function(proj.data, type = c("all", "issues", "pull.requests"), + use.unfiltered.data = TRUE) { type = match.arg(type) logging::logdebug("get.author.issues.created.count: starting.") - df = preprocess.issue.data(proj.data, type = type) + df = preprocess.issue.data(proj.data, type = type, use.unfiltered.data = use.unfiltered.data) ## count distinct since an author may appear in the same issue multiple times stmt = "SELECT `author.name`, COUNT( DISTINCT `issue.id`) as `freq` FROM `df` WHERE `event.name` = 'created' @@ -408,13 +416,15 @@ get.author.issues.created.count = function(proj.data, type = c("all", "issues", #' @param type which issue type to consider (see \code{preprocess.issue.data}). #' One of \code{"issues"}, \code{"pull.requests"} or \code{"all"} #' [default: "all"] +#' @param use.unfiltered.data whether to use unfiltered issue data (see \code{preprocess.issue.data}) [default: FALSE] #' #' @return a dataframe consisting of two columns, the first of which holding the authors' names and the second holding #' their respective issue counts -get.author.issues.commented.in.count = function(proj.data, type = c("all", "issues", "pull.requests")) { +get.author.issues.commented.in.count = function(proj.data, type = c("all", "issues", "pull.requests"), + use.unfiltered.data = FALSE) { type = match.arg(type) logging::logdebug("get.author.issues.commented.in.count: starting.") - df = preprocess.issue.data(proj.data, type = type) + df = preprocess.issue.data(proj.data, type = type, use.unfiltered.data = use.unfiltered.data) ## count distinct since an author may appear in the same issue multiple times stmt = "SELECT `author.name`, COUNT( DISTINCT `issue.id`) as `freq` FROM `df` WHERE `event.name` = 'commented' @@ -433,13 +443,15 @@ get.author.issues.commented.in.count = function(proj.data, type = c("all", "issu #' @param type which issue type to consider (see \code{preprocess.issue.data}). 
#' One of \code{"issues"}, \code{"pull.requests"} or \code{"all"} #' [default: "all"] +#' @param use.unfiltered.data whether to use unfiltered issue data (see \code{preprocess.issue.data}) [default: FALSE] #' #' @return a dataframe consisting of two columns, the first of which holding the authors' names and the second holding #' their respective comment counts -get.author.issue.comment.count = function(proj.data, type = c("all", "issues", "pull.requests")) { +get.author.issue.comment.count = function(proj.data, type = c("all", "issues", "pull.requests"), + use.unfiltered.data = FALSE) { type = match.arg(type) logging::logdebug("get.author.issue.comment.count: starting.") - df = preprocess.issue.data(proj.data, type = type) + df = preprocess.issue.data(proj.data, type = type, use.unfiltered.data = use.unfiltered.data) stmt = "SELECT `author.name`, COUNT(*) as `freq` FROM `df` WHERE `event.name` = 'commented' GROUP BY `author.name` ORDER BY `freq` DESC, `author.name` ASC" @@ -447,3 +459,311 @@ get.author.issue.comment.count = function(proj.data, type = c("all", "issues", " logging::logdebug("get.author.issue.comment.count: finished") return(res) } + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Mail Thread Statistics -------------------------------------------------- + +#' Get the number of contributors to each mail thread based on the mail data contained in the +#' specified \code{ProjectData}. +#' +#' @param proj.data the \code{ProjectData} containing the mail data +#' +#' @return a named list of contributor counts, where the name is the thread. +get.mail.thread.contributor.count = function(proj.data) { + logging::logdebug("get.mail.thread.contributor.count: starting.") + thread.to.mails = get.key.to.value.from.df(proj.data$get.mails(), "thread", "author.email") + thread.to.contributor.count = lapply(thread.to.mails, function(df) { + length(unique(df[["data.vertices"]])) + }) + logging::logdebug("get.mail.thread.contributor.count: finished") + return(thread.to.contributor.count) +} + +#' Get the number of messages in each mail thread based on the mail data contained in the +#' specified \code{ProjectData}. +#' +#' @param proj.data the \code{ProjectData} containing the mail data +#' +#' @return a named list of message counts, where the name is the thread. +get.mail.thread.mail.count = function(proj.data) { + logging::logdebug("get.mail.thread.mail.count: starting.") + thread.to.mails = get.key.to.value.from.df(proj.data$get.mails(), "thread", "author.email") + thread.to.mail.count = lapply(thread.to.mails, function(df) { + length(df[["data.vertices"]]) + }) + logging::logdebug("get.mail.thread.mail.count: finished") + return(thread.to.mail.count) +} + +#' Get the date of the first message in each mail thread based on the mail data contained +#' in the specified \code{ProjectData}. +#' +#' @param proj.data the \code{ProjectData} containing the mail data +#' +#' @return a named list of start dates, where the name is the thread. 
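+#'
+#' Example (a sketch only; assumes a fully configured \code{ProjectData} object 'proj.data' with
+#' mail data; the thread ID shown is hypothetical):
+#' start.dates = get.mail.thread.start.date(proj.data)
+#' start.dates[["<thread-13#37>"]] # POSIXct date of the thread's first message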
+get.mail.thread.start.date = function(proj.data) { + logging::logdebug("get.mail.thread.start.date: starting.") + thread.to.dates = get.key.to.value.from.df(proj.data$get.mails(), "thread", "date") + thread.to.start.date = lapply(thread.to.dates, function(df) { + min(df[["data.vertices"]]) + }) + logging::logdebug("get.mail.thread.start.date: finished") + return(thread.to.start.date) +} + +#' Get the date of the last message in each mail thread based on the mail data contained +#' in the specified \code{ProjectData}. +#' +#' @param proj.data the \code{ProjectData} containing the mail data +#' +#' @return a named list of end dates, where the name is the thread. +get.mail.thread.end.date = function(proj.data) { + logging::logdebug("get.mail.thread.end.date: starting.") + thread.to.dates = get.key.to.value.from.df(proj.data$get.mails(), "thread", "date") + thread.to.end.date = lapply(thread.to.dates, function(df) { + max(df[["data.vertices"]]) + }) + logging::logdebug("get.mail.thread.end.date: finished") + return(thread.to.end.date) +} + +#' Get the identifier of the mailing list from which a thread originates. +#' This identifier is part of the thread ID as produced by Codeface, e.g., if the thread ID is "13#37", then 13 is the +#' ID of the mailing list. +#' +#' Older versions of Codeface did not include this identifier. If the identifier is not included in the data used, a +#' warning is produced and the list will contain \code{NA} for each thread. +#' +#' @param proj.data the \code{ProjectData} containing the mail data +#' +#' @return a named list of mailing list identifiers, where the name is the thread. +get.mail.thread.originating.mailing.list = function(proj.data) { + logging::logdebug("get.mail.thread.originating.mailing.list: starting.") + thread.ids = unique(proj.data$get.mails()[["thread"]]) + thread.to.list = lapply(thread.ids, function(thread.name) { + thread.id = substr(thread.name, 9, nchar(thread.name) - 1) # remove the '<thread-' prefix and the trailing '>' + if (grepl("#", thread.id, fixed = TRUE)) { # make sure that our data has the shape we expect + mailing.list = strsplit(thread.id, "#")[[1]][1] # split at '#' and keep only first part + return(mailing.list) + } else { + logging::logwarn("get.mail.thread.originating.mailing.list called on incompatible data") + return(NA) + } + }) + names(thread.to.list) = thread.ids + logging::logdebug("get.mail.thread.originating.mailing.list: finished") + return(thread.to.list) +} + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Issue Statistics -------------------------------------------------------- + +#' Get the number of contributors to each issue based on the issue data contained +#' in the specified \code{ProjectData}. +#' +#' The type argument specifies whether we count PRs alone, issues alone, or both (\code{"all"}). +#' +#' @param proj.data the \code{ProjectData} containing the issue data +#' @param type which issue type to consider (see \code{preprocess.issue.data}). +#' One of \code{"issues"}, \code{"pull.requests"} or \code{"all"} +#' [default: "all"] +#' @param use.unfiltered.data whether to use unfiltered issue data (see \code{preprocess.issue.data}) [default: FALSE] +#' +#' @return a named list of contributor counts, where the name is the issue ID.
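+#'
+#' Example (a sketch; 'proj.data' is an assumed \code{ProjectData} object, and the issue ID shown
+#' is hypothetical):
+#' contributor.counts = get.issue.contributor.count(proj.data, type = "issues")
+#' contributor.counts[["<issue-2>"]] # number of distinct contributors to this issue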
+get.issue.contributor.count = function(proj.data, type = c("all", "issues", "pull.requests"), + use.unfiltered.data = FALSE) { + type = match.arg(type) + logging::logdebug("get.issue.contributor.count: starting.") + df = preprocess.issue.data(proj.data, type = type, retained.cols = c("issue.id", "author.email"), + use.unfiltered.data = use.unfiltered.data) + issue.id.to.events = get.key.to.value.from.df(df, "issue.id", "author.email") + issue.id.to.contributor.count = lapply(issue.id.to.events, function(df) { + length(unique(df[["data.vertices"]])) + }) + logging::logdebug("get.issue.contributor.count: finished") + return(issue.id.to.contributor.count) +} + +#' Get the number of events for each issue based on the issue data contained +#' in the specified \code{ProjectData}. +#' +#' The type argument specifies whether we count PRs alone, issues alone, or both (\code{"all"}). +#' +#' @param proj.data the \code{ProjectData} containing the issue data +#' @param type which issue type to consider (see \code{preprocess.issue.data}). +#' One of \code{"issues"}, \code{"pull.requests"} or \code{"all"} +#' [default: "all"] +#' @param use.unfiltered.data whether to use unfiltered issue data (see \code{preprocess.issue.data}) [default: FALSE] +#' +#' @return a named list of event counts, where the name is the issue ID. +get.issue.event.count = function(proj.data, type = c("all", "issues", "pull.requests"), use.unfiltered.data = FALSE) { + type = match.arg(type) + logging::logdebug("get.issue.event.count: starting.") + df = preprocess.issue.data(proj.data, type = type, retained.cols = c("issue.id", "event.id"), + use.unfiltered.data = use.unfiltered.data) + issue.id.to.events = get.key.to.value.from.df(df, "issue.id", "event.id") + issue.id.to.event.count = lapply(issue.id.to.events, function(df) { + ## one event might show up multiple times (i.e. 'mentioned' also triggers 'subscribed'), + ## so we count the number of distinct event IDs + length(unique(df[["data.vertices"]])) + }) + logging::logdebug("get.issue.event.count: finished") + return(issue.id.to.event.count) +} + +#' Get the number of 'commented' events for each issue based on the issue data contained +#' in the specified \code{ProjectData}. +#' +#' The type argument specifies whether we count PRs alone, issues alone, or both (\code{"all"}). +#' +#' @param proj.data the \code{ProjectData} containing the issue data +#' @param type which issue type to consider (see \code{preprocess.issue.data}). +#' One of \code{"issues"}, \code{"pull.requests"} or \code{"all"} +#' [default: "all"] +#' +#' @return a named list of comment counts, where the name is the issue ID. +get.issue.comment.count = function(proj.data, type = c("all", "issues", "pull.requests")) { + type = match.arg(type) + logging::logdebug("get.issue.comment.count: starting.") + df = preprocess.issue.data(proj.data, type = type, retained.cols = c("issue.id", "event.name")) + issue.id.to.events = get.key.to.value.from.df(df, "issue.id", "event.name") + issue.id.to.comment.count = lapply(issue.id.to.events, function(df) { + event.names = df[["data.vertices"]] + return (length(event.names[event.names == "commented"])) + }) + logging::logdebug("get.issue.comment.count: finished") + return(issue.id.to.comment.count) +} + +#' Get the date each issue was opened, based on the issue data contained +#' in the specified \code{ProjectData}. +#' +#' The type argument specifies whether we count PRs alone, issues alone, or both (\code{"all"}). 
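+#'
+#' Example (a sketch; assumes a configured \code{ProjectData} object 'proj.data'):
+#' opened.dates = get.issue.opened.date(proj.data, type = "pull.requests")
+#' ## a named list mapping each pull request's issue ID to its POSIXct creation date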
+#' +#' @param proj.data the \code{ProjectData} containing the issue data +#' @param type which issue type to consider (see \code{preprocess.issue.data}). +#' One of \code{"issues"}, \code{"pull.requests"} or \code{"all"} +#' [default: "all"] +#' +#' @return a named list of dates, where the name is the issue ID. +get.issue.opened.date = function(proj.data, type = c("all", "issues", "pull.requests")) { + type = match.arg(type) + logging::logdebug("get.issue.opened.date: starting.") + df = preprocess.issue.data(proj.data, type = type, retained.cols = c("issue.id", "creation.date")) + issue.id.to.dates = get.key.to.value.from.df(df, "issue.id", "creation.date") + issue.id.to.start.date = lapply(issue.id.to.dates, function(df) { + min(df[["data.vertices"]]) # values should all be the same + }) + logging::logdebug("get.issue.opened.date: finished") + return(issue.id.to.start.date) +} + +#' Get the date each issue was closed, based on the issue data contained +#' in the specified \code{ProjectData}, or \code{NA} if the issue is still open. +#' +#' The type argument specifies whether we count PRs alone, issues alone, or both (\code{"all"}). +#' +#' @param proj.data the \code{ProjectData} containing the issue data +#' @param type which issue type to consider (see \code{preprocess.issue.data}). +#' One of \code{"issues"}, \code{"pull.requests"} or \code{"all"} +#' [default: "all"] +#' +#' @return a named list of dates, where the name is the issue ID. +get.issue.closed.date = function(proj.data, type = c("all", "issues", "pull.requests")) { + type = match.arg(type) + logging::logdebug("get.issue.closed.date: starting.") + df = preprocess.issue.data(proj.data, type = type, retained.cols = c("issue.id", "closing.date")) + issue.id.to.dates = get.key.to.value.from.df(df, "issue.id", "closing.date") + issue.id.to.closed.date = lapply(issue.id.to.dates, function(df) { + min(df[["data.vertices"]]) # values should all be the same + }) + logging::logdebug("get.issue.closed.date: finished") + return(issue.id.to.closed.date) +} + +#' Get the date of the last activity in each issue based on the issue data contained +#' in the specified \code{ProjectData}. +#' +#' The type argument specifies whether we count PRs alone, issues alone, or both (\code{"all"}). +#' +#' @param proj.data the \code{ProjectData} containing the issue data +#' @param type which issue type to consider (see \code{preprocess.issue.data}). +#' One of \code{"issues"}, \code{"pull.requests"} or \code{"all"} +#' [default: "all"] +#' @param use.unfiltered.data whether to use unfiltered issue data (see \code{preprocess.issue.data}) [default: FALSE] +#' +#' @return a named list of dates, where the name is the issue ID. +get.issue.last.activity.date = function(proj.data, type = c("all", "issues", "pull.requests"), + use.unfiltered.data = FALSE) { + type = match.arg(type) + logging::logdebug("get.issue.last.activity.date: starting.") + df = preprocess.issue.data(proj.data, type = type, retained.cols = c("issue.id", "date"), + use.unfiltered.data = use.unfiltered.data) + issue.id.to.dates = get.key.to.value.from.df(df, "issue.id", "date") + issue.id.to.end.date = lapply(issue.id.to.dates, function(df) { + max(df[["data.vertices"]]) + }) + logging::logdebug("get.issue.last.activity.date: finished") + return(issue.id.to.end.date) +} + +#' Get the title of each issue based on the issue data contained +#' in the specified \code{ProjectData}. +#' +#' The type argument specifies whether we count PRs alone, issues alone, or both (\code{"all"}). 
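+#'
+#' Example (a sketch; 'proj.data' is an assumed \code{ProjectData} object):
+#' titles = get.issue.title(proj.data)
+#' ## a named list mapping each issue ID to its title as a character string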
+#' +#' @param proj.data the \code{ProjectData} containing the issue data +#' @param type which issue type to consider (see \code{preprocess.issue.data}). +#' One of \code{"issues"}, \code{"pull.requests"} or \code{"all"} +#' [default: "all"] +#' +#' @return a named list of issue titles, where the name is the issue ID. +get.issue.title = function(proj.data, type = c("all", "issues", "pull.requests")) { + type = match.arg(type) + logging::logdebug("get.issue.title: starting.") + df = preprocess.issue.data(proj.data, type = type, retained.cols = c("issue.id", "issue.title")) + issue.id.to.title = get.key.to.value.from.df(df, "issue.id", "issue.title") + issue.id.to.title.only = lapply(issue.id.to.title, function(df) { + ## as a result of get.key.to.value.from.df, the "issue.title" column should be duplicated as "data.vertices". + ## The title should be the same in every row, so we can just use the first row. + df[[1, "data.vertices"]] # data frames resulting from get.key.to.value.from.df always have at least one row + }) + logging::logdebug("get.issue.title: finished") + return(issue.id.to.title.only) +} + +#' Get whether a PR is open, has been merged, or has been closed without merging. +#' +#' @param proj.data the \code{ProjectData} containing the issue data +#' @param use.unfiltered.data whether to use unfiltered issue data (see \code{preprocess.issue.data}) [default: TRUE] +#' +#' @return a named list of states (\code{"open"}, \code{"merged"}, or \code{"closed"}), where the name is the issue ID. +get.pr.open.merged.or.closed = function(proj.data, use.unfiltered.data = TRUE) { + logging::logdebug("get.pr.open.merged.or.closed: starting.") + df = preprocess.issue.data(proj.data, type = "pull.requests", use.unfiltered.data = use.unfiltered.data, + retained.cols = c("issue.id", "issue.state", "event.name")) + issue.id.to.events = get.key.to.value.from.df(df, "issue.id", "event.name") + issue.id.to.state = lapply(issue.id.to.events, function(df) { + return (if ("open" %in% df[["issue.state"]] || "reopened" %in% df[["issue.state"]]) "open" + else if ("merged" %in% df[["event.name"]]) "merged" + else "closed") + }) + logging::logdebug("get.pr.open.merged.or.closed: finished") + return(issue.id.to.state) +} + +#' Get whether each issue is a pull request, based on the issue data contained in the specified +#' \code{ProjectData}. +#' +#' @param proj.data the \code{ProjectData} containing the issue data +#' +#' @return a named list of logical values, where the name is the issue ID. +get.issue.is.pull.request = function(proj.data) { + logging::logdebug("get.issue.is.pull.request: starting.") + issue.data = proj.data$get.issues() + issue.id.to.is.pr = as.list(mask.pull.requests(issue.data)) + names(issue.id.to.is.pr) = issue.data[["issue.id"]] + logging::logdebug("get.issue.is.pull.request: finished") + return(issue.id.to.is.pr) +} diff --git a/util-data.R b/util-data.R index 8304af71..e4025c75 100644 --- a/util-data.R +++ b/util-data.R @@ -24,6 +24,7 @@ ## Copyright 2020-2021 by Niklas Schneider ## Copyright 2021 by Johannes Hostert ## Copyright 2021 by Mirabdulla Yusifli +## Copyright 2022 by Jonathan Baumann ## All Rights Reserved. @@ -69,11 +70,12 @@ DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION = list( ) ## Yields the getters associated with additional data sources, e.g. author data.
DATASOURCE.TO.ADDITIONAL.ARTIFACT.FUNCTION = list( - "authors" = "get.authors", - "commit.messages" = "get.commit.messages", - "synchronicity" = "get.synchronicity", - "pasta" = "get.pasta", - "gender" = "get.gender" + "authors" = "get.authors", + "commit.messages" = "get.commit.messages", + "synchronicity" = "get.synchronicity", + "pasta" = "get.pasta", + "gender" = "get.gender", + "custom.event.timestamps" = "get.custom.event.timestamps" ) #' Applies a function to list keys @@ -117,7 +119,9 @@ CONF.PARAMETERS.NO.RESET.ENVIRONMENT = c("commit.messages", "synchronicity.time.window", "commits.locked", "issues.locked", - "mails.locked") + "mails.locked", + "custom.event.timestamps", + "custom.event.timestamps.locked") ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / @@ -173,6 +177,8 @@ ProjectData = R6::R6Class("ProjectData", pasta.commits = create.empty.pasta.list(), # data.frame ## timestamps of mail, issue and commit data data.timestamps = data.frame(start = numeric(0), end = numeric(0)), # data.frame + ## custom timestamps for splitting + custom.event.timestamps = list(), # list ## * * commit filtering -------------------------------------------- @@ -514,51 +520,44 @@ ProjectData = R6::R6Class("ProjectData", update.pasta.commit.data = function() { logging::logdebug("update.pasta.commit.data: starting.") - ## return immediately if no commits available - if (self$is.data.source.cached("commits.unfiltered")) { - - ## remove previous PaStA data - private$commits.unfiltered["pasta"] = NULL - private$commits.unfiltered["revision.set.id"] = NULL + ## remove previous PaStA data + private$commits.unfiltered["pasta"] = NULL + private$commits.unfiltered["revision.set.id"] = NULL - ## only merge new data if pasta has been configured (it could also be changed to 'FALSE' in which case - ## we want to just remove the columns above) - if (private$project.conf$get.value("pasta")) { - ## merge PaStA data - private$commits.unfiltered = merge(private$commits.unfiltered, private$pasta.commits, - by = "hash", all.x = TRUE, sort = FALSE) + ## only merge new data if pasta has been configured (it could also be changed to 'FALSE' in which case + ## we want to just remove the columns above) + if (private$project.conf$get.value("pasta")) { + ## merge PaStA data + private$commits.unfiltered = merge(private$commits.unfiltered, private$pasta.commits, + by = "hash", all.x = TRUE, sort = FALSE) - ## sort by date again because 'merge' disturbs the order - private$commits.unfiltered = private$commits.unfiltered[order(private$commits.unfiltered[["date"]], decreasing = FALSE), ] + ## sort by date again because 'merge' disturbs the order + private$commits.unfiltered = private$commits.unfiltered[order(private$commits.unfiltered[["date"]], decreasing = FALSE), ] - ## remove duplicated revision set ids - private$commits.unfiltered[["revision.set.id"]] = sapply(private$commits.unfiltered[["revision.set.id"]], function(rev.id) { - return(unique(rev.id)) - }) - } + ## remove duplicated revision set ids + private$commits.unfiltered[["revision.set.id"]] = lapply(private$commits.unfiltered[["revision.set.id"]], function(rev.id) { + return(unique(rev.id)) + }) } - if (self$is.data.source.cached("commits")) { - - ## remove previous PaStA data - private$commits["pasta"] = NULL - private$commits["revision.set.id"] = NULL + ## remove previous PaStA data + private$commits["pasta"] = NULL + private$commits["revision.set.id"] = NULL - ## only merge new data if pasta has been configured (it could also be changed to 
'FALSE' in which case - ## we want to just remove the columns above) - if (private$project.conf$get.value("pasta")) { - ## merge PaStA data - private$commits = merge(private$commits, private$pasta.commits, - by = "hash", all.x = TRUE, sort = FALSE) + ## only merge new data if pasta has been configured (it could also be changed to 'FALSE' in which case + ## we want to just remove the columns above) + if (private$project.conf$get.value("pasta")) { + ## merge PaStA data + private$commits = merge(private$commits, private$pasta.commits, + by = "hash", all.x = TRUE, sort = FALSE) - ## sort by date again because 'merge' disturbs the order - private$commits = private$commits[order(private$commits[["date"]], decreasing = FALSE), ] + ## sort by date again because 'merge' disturbs the order + private$commits = private$commits[order(private$commits[["date"]], decreasing = FALSE), ] - ## remove duplicated revision set ids - private$commits[["revision.set.id"]] = sapply(private$commits[["revision.set.id"]], function(rev.id) { - return(unique(rev.id)) - }) - } + ## remove duplicated revision set ids + private$commits[["revision.set.id"]] = lapply(private$commits[["revision.set.id"]], function(rev.id) { + return(unique(rev.id)) + }) } logging::logdebug("update.pasta.commit.data: finished.") @@ -569,52 +568,44 @@ ProjectData = R6::R6Class("ProjectData", update.pasta.mail.data = function() { logging::logdebug("update.pasta.mail.data: starting.") - ## return immediately if no mails available - if (self$is.data.source.cached("mails.unfiltered")) { - - ## remove previous PaStA data - private$mails.unfiltered["pasta"] = NULL - private$mails.unfiltered["revision.set.id"] = NULL + ## remove previous PaStA data + private$mails.unfiltered["pasta"] = NULL + private$mails.unfiltered["revision.set.id"] = NULL - ## only merge new data if pasta has been configured (it could also be changed to 'FALSE' in which case - ## we want to just remove the columns above) - if (private$project.conf$get.value("pasta")) { - ## merge PaStA data - private$mails.unfiltered = merge(private$mails.unfiltered, private$pasta.mails, - by = "message.id", all.x = TRUE, sort = FALSE) + ## only merge new data if pasta has been configured (it could also be changed to 'FALSE' in which case + ## we want to just remove the columns above) + if (private$project.conf$get.value("pasta")) { + ## merge PaStA data + private$mails.unfiltered = merge(private$mails.unfiltered, private$pasta.mails, + by = "message.id", all.x = TRUE, sort = FALSE) - ## sort by date again because 'merge' disturbs the order - private$mails.unfiltered = private$mails.unfiltered[order(private$mails.unfiltered[["date"]], decreasing = FALSE), ] + ## sort by date again because 'merge' disturbs the order + private$mails.unfiltered = private$mails.unfiltered[order(private$mails.unfiltered[["date"]], decreasing = FALSE), ] - ## remove duplicated revision set ids - private$mails.unfiltered[["revision.set.id"]] = sapply(private$mails.unfiltered[["revision.set.id"]], function(rev.id) { - return(unique(rev.id)) - }) - } + ## remove duplicated revision set ids + private$mails.unfiltered[["revision.set.id"]] = lapply(private$mails.unfiltered[["revision.set.id"]], function(rev.id) { + return(unique(rev.id)) + }) } - ## the same block as above, but now for filtered mails - if (self$is.data.source.cached("mails")) { - - ## remove previous PaStA data - private$mails["pasta"] = NULL - private$mails["revision.set.id"] = NULL + ## remove previous PaStA data + private$mails["pasta"] = NULL + 
private$mails["revision.set.id"] = NULL - ## only merge new data if pasta has been configured (it could also be changed to 'FALSE' in which case - ## we want to just remove the columns above) - if (private$project.conf$get.value("pasta")) { - ## merge PaStA data - private$mails = merge(private$mails, private$pasta.mails, - by = "message.id", all.x = TRUE, sort = FALSE) + ## only merge new data if pasta has been configured (it could also be changed to 'FALSE' in which case + ## we want to just remove the columns above) + if (private$project.conf$get.value("pasta")) { + ## merge PaStA data + private$mails = merge(private$mails, private$pasta.mails, + by = "message.id", all.x = TRUE, sort = FALSE) - ## sort by date again because 'merge' disturbs the order - private$mails = private$mails[order(private$mails[["date"]], decreasing = FALSE), ] + ## sort by date again because 'merge' disturbs the order + private$mails = private$mails[order(private$mails[["date"]], decreasing = FALSE), ] - ## remove duplicated revision set ids - private$mails[["revision.set.id"]] = sapply(private$mails[["revision.set.id"]], function(rev.id) { - return(unique(rev.id)) - }) - } + ## remove duplicated revision set ids + private$mails[["revision.set.id"]] = lapply(private$mails[["revision.set.id"]], function(rev.id) { + return(unique(rev.id)) + }) } logging::logdebug("update.pasta.mail.data: finished.") @@ -637,14 +628,10 @@ ProjectData = R6::R6Class("ProjectData", private$aggregate.pasta.data() ## update mail data by attaching PaStA data - if (self$is.data.source.cached("mails.unfiltered")) { - private$update.pasta.mail.data() - } + private$update.pasta.mail.data() ## update commit data by attaching PaStA data - if (self$is.data.source.cached("commits.unfiltered") ) { - private$update.pasta.commit.data() - } + private$update.pasta.commit.data() ## get the caller function as a string stacktrace = get.stacktrace(sys.calls()) @@ -672,36 +659,32 @@ ProjectData = R6::R6Class("ProjectData", ## update commit data by attaching synchronicity data ## do not check whether synchronicity is available in order to remove the columns if it is not - if (self$is.data.source.cached("commits.unfiltered")) { - ## remove previous synchronicity data - private$commits.unfiltered["synchronicity"] = NULL - - ## only merge new data if synchronicity has been configured (it could also be changed to 'FALSE' in - ## which case we want to just remove the columns above) - if (private$project.conf$get.value("synchronicity")) { - ## merge synchronicity data - private$commits.unfiltered = merge(private$commits.unfiltered, private$synchronicity, - by = "hash", all.x = TRUE, sort = FALSE) + ## remove previous synchronicity data + private$commits.unfiltered["synchronicity"] = NULL - ## sort by date again because 'merge' disturbs the order - private$commits.unfiltered = private$commits.unfiltered[order(private$commits.unfiltered[["date"]], decreasing = FALSE), ] - } + ## only merge new data if synchronicity has been configured (it could also be changed to 'FALSE' in + ## which case we want to just remove the columns above) + if (private$project.conf$get.value("synchronicity")) { + ## merge synchronicity data + private$commits.unfiltered = merge(private$commits.unfiltered, private$synchronicity, + by = "hash", all.x = TRUE, sort = FALSE) + + ## sort by date again because 'merge' disturbs the order + private$commits.unfiltered = private$commits.unfiltered[order(private$commits.unfiltered[["date"]], decreasing = FALSE), ] } - if 
(self$is.data.source.cached("commits")) { - ## remove previous synchronicity data - private$commits["synchronicity"] = NULL - - ## only merge new data if synchronicity has been configured (it could also be changed to 'FALSE' in - ## which case we want to just remove the columns above) - if (private$project.conf$get.value("synchronicity")) { - ## merge synchronicity data - private$commits = merge(private$commits, private$synchronicity, - by = "hash", all.x = TRUE, sort = FALSE) - - ## sort by date again because 'merge' disturbs the order - private$commits = private$commits[order(private$commits[["date"]], - decreasing = FALSE), ] - } + ## remove previous synchronicity data + private$commits["synchronicity"] = NULL + + ## only merge new data if synchronicity has been configured (it could also be changed to 'FALSE' in + ## which case we want to just remove the columns above) + if (private$project.conf$get.value("synchronicity")) { + ## merge synchronicity data + private$commits = merge(private$commits, private$synchronicity, + by = "hash", all.x = TRUE, sort = FALSE) + + ## sort by date again because 'merge' disturbs the order + private$commits = private$commits[order(private$commits[["date"]], + decreasing = FALSE), ] } ## get the caller function as a string @@ -887,6 +870,10 @@ ProjectData = R6::R6Class("ProjectData", private$update.synchronicity.data() } } + ## if the 'custom.event.timestamps.file' parameter has changed, we want to clear them to trigger a re-read. + if (entry == "custom.event.timestamps.file") { + self$clear.custom.event.timestamps() + } } }, @@ -930,6 +917,10 @@ ProjectData = R6::R6Class("ProjectData", private$update.synchronicity.data() } } + ## if the 'custom.event.timestamps.file' parameter has changed, we want to clear them to trigger a re-read. + if (c("custom.event.timestamps.file") %in% params) { + self$clear.custom.event.timestamps() + } } }, @@ -1115,7 +1106,7 @@ ProjectData = R6::R6Class("ProjectData", ## remove cached data for filtered commits as these need to be re-computed after ## changing the data - private$commits = NULL + private$commits = create.empty.commits.list() }, #' Get the list of commits which have the artifact kind configured in the \code{project.conf}. @@ -1513,7 +1504,7 @@ ProjectData = R6::R6Class("ProjectData", private$authors = data ## add gender data if wanted if (private$project.conf$get.value("gender")) { - + ## if data are not read already, read them if (!self$is.data.source.cached("gender")) { ## get data (no assignment because we just want to trigger anything gender-related) @@ -1618,7 +1609,7 @@ ProjectData = R6::R6Class("ProjectData", } private$issues.unfiltered = data - private$issues = NULL + private$issues = create.empty.issues.list() }, #' Get the list of artifacts from the given \code{data.source} of the project. 
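The hunks below register the custom event timestamps as an additional data source and add their accessor methods. A minimal usage sketch (assuming a configured ProjectConf object 'proj.conf'; the file name 'custom-events.list' is hypothetical and has to exist in the project's data path):

## configure the file from which custom event timestamps are read
proj.conf$update.value("custom.event.timestamps.file", "custom-events.list")
project.data = ProjectData$new(proj.conf)
## read (and cache) the timestamps; the returned list is sorted by date
timestamps = project.data$get.custom.event.timestamps()
## the timestamps can then drive splitting, e.g., via 'split.data.time.based.by.timestamps'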
@@ -1718,7 +1709,8 @@ ProjectData = R6::R6Class("ProjectData", "authors" = "authors", "commit.messages" = "commit.messages", "synchronicity" = "synchronicity", - "pasta" = "pasta" + "pasta" = "pasta", + "custom.event.timestamps" = "custom.event.timestamps" ) ) sources = self$get.cached.data.sources.internal(source.type) @@ -1749,7 +1741,8 @@ ProjectData = R6::R6Class("ProjectData", ## define the data sources unfiltered.data.sources = c("commits.unfiltered", "mails.unfiltered", "issues.unfiltered") - additional.data.sources = c("authors", "commit.messages", "synchronicity", "pasta", "gender") + additional.data.sources = c("authors", "commit.messages", "synchronicity", "pasta", + "gender", "custom.event.timestamps") main.data.sources = c("issues", "commits", "mails") ## set the right data sources to look for according to the argument @@ -2109,6 +2102,46 @@ ProjectData = R6::R6Class("ProjectData", data = unique(data) return (data) + }, + + #' Get the list of custom event timestamps, + #' read from a file configured by the \code{custom.event.timestamps.file} + #' parameter in the project configuration. + #' + #' @return the list of custom event timestamps + get.custom.event.timestamps = function() { + if (!self$is.data.source.cached("custom.event.timestamps") + && !private$project.conf$get.value("custom.event.timestamps.locked")) { + + file.name = self$get.project.conf.entry("custom.event.timestamps.file") + if (is.na(file.name)) { + logging::logwarn("get.custom.event.timestamps: No file configured") + return (list()) + } + timestamps = read.custom.event.timestamps(self$get.data.path(), file.name) + self$set.custom.event.timestamps(timestamps) + } + return (private$custom.event.timestamps) + }, + + #' Set the list of custom event timestamps. + #' The list will be sorted. + #' + #' @param custom.event.timestamps the list of timestamps to set + set.custom.event.timestamps = function(custom.event.timestamps) { + if (length(custom.event.timestamps) != 0) { + private$custom.event.timestamps = custom.event.timestamps[ + order(unlist(get.date.from.string(custom.event.timestamps))) + ] + } else { + private$custom.event.timestamps = custom.event.timestamps + } + }, + + #' Clear existing custom event timestamps, for example to cause them to be re-read from a file. + clear.custom.event.timestamps = function() { + private$custom.event.timestamps = list() } ) ) diff --git a/util-misc.R b/util-misc.R index c62cfc11..b161cf67 100644 --- a/util-misc.R +++ b/util-misc.R @@ -19,6 +19,7 @@ ## Copyright 2020-2021 by Thomas Bock ## Copyright 2018-2019 by Jakob Kronawitter ## Copyright 2021 by Niklas Schneider +## Copyright 2022 by Jonathan Baumann ## All Rights Reserved. @@ -380,11 +381,7 @@ generate.date.sequence = function(start.date, end.date, by, length.out = NULL) { if (is.null(length.out)) { time.period = lubridate::duration(by) } else { - time.complete = lubridate::as.duration(lubridate::interval(start.date, end.date)) - time.period = time.complete / length.out - ## to avoid rounding differences, we round the time period up - ## (otherwise, we may end up with another unwanted date in the sequence) - time.period = ceiling(time.period) + time.period = get.time.period.by.amount(start.date, end.date, length.out) } ## convenience function for next step @@ -413,6 +410,22 @@ generate.date.sequence = function(start.date, end.date, by, length.out = NULL) { return(dates) } +#' Calculate the duration of a single window when splitting a time period into equally sized windows.
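+#'
+#' Example (a sketch; 'get.date.from.string' is coronet's own date parser): a 30-day interval
+#' split into 4 windows yields ceiling(2592000 s / 4) = 648000 s, i.e., 7.5-day windows:
+#' get.time.period.by.amount(get.date.from.string("2000-01-01"),
+#'                           get.date.from.string("2000-01-31"), 4)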
+#' +#' @param start.date The start time as string or POSIXct object +#' @param end.date The end time as string or POSIXct object +#' @param amount The desired amount of windows +#' +#' @return The duration of a single window +get.time.period.by.amount = function(start.date, end.date, amount) { + time.complete = lubridate::as.duration(lubridate::interval(start.date, end.date)) + time.period = time.complete / amount + ## to avoid rounding differences, we round the time period up + ## (otherwise, we may end up with another unwanted date in the sequence) + time.period = ceiling(time.period) + return(time.period) +} + ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Range construction and handling ----------------------------------------- diff --git a/util-networks-covariates.R b/util-networks-covariates.R index a550663e..31dd7134 100644 --- a/util-networks-covariates.R +++ b/util-networks-covariates.R @@ -20,6 +20,7 @@ ## Copyright 2020 by Christian Hechtl ## Copyright 2021 by Johannes Hostert ## Copyright 2022 by Niklas Schneider +## Copyright 2022 by Jonathan Baumann ## All Rights Reserved. ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / @@ -121,7 +122,7 @@ add.vertex.attribute = function(net.to.range.list, attr.name, default.value, com function(x) get.or.default(x, attrs.by.vertex.name, default.value)) ## simplify the list of attributes to a vector if all its elements are just vectors (not lists) - if (length(attributes) > 0 && !any(lapply(attributes, is.list))) { + if (length(attributes) > 0 && !any(sapply(attributes, is.list))) { attributes = unlist(attributes) } ## otherwise, the list of attributes contains lists, so we can only remove the outermost list @@ -203,7 +204,7 @@ add.vertex.attribute.count.helper = function(list.of.networks, project.data, nam #' @param default.value The default value to add if a vertex has no matching value [default: 0L] #' #' @return A list of networks with the added attribute -add.vertex.attribute.commit.count.author = function(list.of.networks, project.data, name = "commit.count", +add.vertex.attribute.author.commit.count = function(list.of.networks, project.data, name = "commit.count", aggregation.level = c("range", "cumulative", "all.ranges", "project.cumulative", "project.all.ranges", "complete"), @@ -229,10 +230,12 @@ add.vertex.attribute.commit.count.author = function(list.of.networks, project.da #' @param default.value The default value to add if a vertex has no matching value [default: 0L] #' #' @return A list of networks with the added attribute -add.vertex.attribute.commit.count.author.not.committer = function(list.of.networks, project.data, +add.vertex.attribute.author.commit.count.not.committer = function(list.of.networks, project.data, name = "commit.count.author.not.committer", - aggregation.level = c("range", "cumulative", "all.ranges", - "project.cumulative", "project.all.ranges", + aggregation.level = c("range", "cumulative", + "all.ranges", + "project.cumulative", + "project.all.ranges", "complete"), default.value = 0L) { nets.with.attr = add.vertex.attribute.count.helper( @@ -256,11 +259,13 @@ add.vertex.attribute.commit.count.author.not.committer = function(list.of.networ #' @param default.value The default value to add if a vertex has no matching value [default: 0L] #' #' @return A list of networks with the added attribute -add.vertex.attribute.commit.count.committer = function(list.of.networks, project.data, name = "commit.count.committer", - aggregation.level = c("range", 
"cumulative", "all.ranges", - "project.cumulative", "project.all.ranges", - "complete"), - default.value = 0L) { +add.vertex.attribute.author.commit.count.committer = function(list.of.networks, project.data, + name = "commit.count.committer", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", + "project.all.ranges", + "complete"), + default.value = 0L) { nets.with.attr = add.vertex.attribute.count.helper( list.of.networks, project.data, name, aggregation.level, default.value, get.committer.commit.count, "committer.name" @@ -282,10 +287,12 @@ add.vertex.attribute.commit.count.committer = function(list.of.networks, project #' @param default.value The default value to add if a vertex has no matching value [default: 0L] #' #' @return A list of networks with the added attribute -add.vertex.attribute.commit.count.committer.not.author = function(list.of.networks, project.data, +add.vertex.attribute.author.commit.count.committer.not.author = function(list.of.networks, project.data, name = "commit.count.committer.not.author", - aggregation.level = c("range", "cumulative", "all.ranges", - "project.cumulative", "project.all.ranges", + aggregation.level = c("range", "cumulative", + "all.ranges", + "project.cumulative", + "project.all.ranges", "complete"), default.value = 0L) { nets.with.attr = add.vertex.attribute.count.helper( @@ -309,10 +316,12 @@ add.vertex.attribute.commit.count.committer.not.author = function(list.of.networ #' @param default.value The default value to add if a vertex has no matching value [default: 0L] #' #' @return A list of networks with the added attribute -add.vertex.attribute.commit.count.committer.and.author = function(list.of.networks, project.data, +add.vertex.attribute.author.commit.count.committer.and.author = function(list.of.networks, project.data, name = "commit.count.committer.and.author", - aggregation.level = c("range", "cumulative", "all.ranges", - "project.cumulative", "project.all.ranges", + aggregation.level = c("range", "cumulative", + "all.ranges", + "project.cumulative", + "project.all.ranges", "complete"), default.value = 0L) { nets.with.attr = add.vertex.attribute.count.helper( @@ -337,10 +346,12 @@ add.vertex.attribute.commit.count.committer.and.author = function(list.of.networ #' @param default.value The default value to add if a vertex has no matching value [default: 0L] #' #' @return A list of networks with the added attribute -add.vertex.attribute.commit.count.committer.or.author = function(list.of.networks, project.data, +add.vertex.attribute.author.commit.count.committer.or.author = function(list.of.networks, project.data, name = "commit.count.committer.or.author", - aggregation.level = c("range", "cumulative", "all.ranges", - "project.cumulative", "project.all.ranges", + aggregation.level = c("range", "cumulative", + "all.ranges", + "project.cumulative", + "project.all.ranges", "complete"), default.value = 0L) { nets.with.attr = add.vertex.attribute.count.helper( @@ -365,7 +376,7 @@ add.vertex.attribute.commit.count.committer.or.author = function(list.of.network #' @param default.value The default value to add if a vertex has no matching value [default: 0L] #' #' @return A list of networks with the added attribute -add.vertex.attribute.artifact.count = function(list.of.networks, project.data, name = "artifact.count", +add.vertex.attribute.author.artifact.count = function(list.of.networks, project.data, name = "artifact.count", aggregation.level = c("range", "cumulative", "all.ranges", "project.cumulative", 
"project.all.ranges", "complete"), @@ -403,7 +414,7 @@ add.vertex.attribute.artifact.count = function(list.of.networks, project.data, n #' @param default.value The default value to add if a vertex has no matching value [default: 0L] #' #' @return A list of networks with the added attribute -add.vertex.attribute.mail.count = function(list.of.networks, project.data, +add.vertex.attribute.author.mail.count = function(list.of.networks, project.data, name = "mail.count", aggregation.level = c("range", "cumulative", "all.ranges", "project.cumulative", "project.all.ranges", @@ -430,7 +441,7 @@ add.vertex.attribute.mail.count = function(list.of.networks, project.data, #' @param default.value The default value to add if a vertex has no matching value [default: 0L] #' #' @return A list of networks with the added attribute -add.vertex.attribute.mail.thread.count = function(list.of.networks, project.data, +add.vertex.attribute.author.mail.thread.count = function(list.of.networks, project.data, name = "mail.thread.count", aggregation.level = c("range", "cumulative", "all.ranges", "project.cumulative", "project.all.ranges", @@ -459,22 +470,27 @@ add.vertex.attribute.mail.thread.count = function(list.of.networks, project.data #' more details. [default: "range"] #' @param default.value The default value to add if a vertex has no matching value [default: 0L] #' @param issue.type The issue kind,see \code{preprocess.issue.data} [default: "all"] +#' @param use.unfiltered.data whether to use unfiltered issue data (see \code{preprocess.issue.data}) [default: FALSE] #' #' @return A list of networks with the added attribute -add.vertex.attribute.issue.count = function(list.of.networks, project.data, +add.vertex.attribute.author.issue.count = function(list.of.networks, project.data, name = "issue.count", aggregation.level = c("range", "cumulative", "all.ranges", "project.cumulative", "project.all.ranges", "complete"), - default.value = 0L, issue.type = c("all", "pull.requests", "issues")) { + default.value = 0L, issue.type = c("all", "pull.requests", "issues"), + use.unfiltered.data = FALSE) { issue.type = match.arg(issue.type) - if (name == "issue.count" && identical(issue.type, "pull.requests")) { + if (missing(name) && identical(issue.type, "pull.requests")) { name = "pull.request.count" } nets.with.attr = add.vertex.attribute.count.helper( - list.of.networks, project.data, name, aggregation.level, - default.value, function(data) {return(get.author.issue.count(data, type = issue.type))}, "author.name" + list.of.networks, project.data, name, aggregation.level, default.value, + function(data) { + return(get.author.issue.count(data, type = issue.type,use.unfiltered.data = use.unfiltered.data)) + }, + "author.name" ) return(nets.with.attr) @@ -493,22 +509,30 @@ add.vertex.attribute.issue.count = function(list.of.networks, project.data, #' more details. 
[default: "range"] #' @param default.value The default value to add if a vertex has no matching value [default: 0L] #' @param issue.type The issue kind,see \code{preprocess.issue.data} [default: "all"] +#' @param use.unfiltered.data whether to use unfiltered issue data (see \code{preprocess.issue.data}) [default: FALSE] #' #' @return A list of networks with the added attribute -add.vertex.attribute.issues.commented.count = function(list.of.networks, project.data, +add.vertex.attribute.author.issues.commented.count = function(list.of.networks, project.data, name = "issues.commented.count", aggregation.level = c("range", "cumulative", "all.ranges", - "project.cumulative", "project.all.ranges", + "project.cumulative", + "project.all.ranges", "complete"), - default.value = 0L, issue.type = c("all", "pull.requests", "issues")) { + default.value = 0L, issue.type = c("all", "pull.requests", + "issues"), + use.unfiltered.data = FALSE) { issue.type = match.arg(issue.type) - if (name == "issues.commented.count" && identical(issue.type, "pull.requests")) { + if (missing(name) && identical(issue.type, "pull.requests")) { name = "pull.requests.commented.count" } nets.with.attr = add.vertex.attribute.count.helper( list.of.networks, project.data, name, aggregation.level, - default.value, function(data) {return(get.author.issues.commented.in.count(data, type = issue.type))}, "author.name" + default.value, function(data) { + return(get.author.issues.commented.in.count(data, type = issue.type, + use.unfiltered.data = use.unfiltered.data)) + }, + "author.name" ) return(nets.with.attr) @@ -527,28 +551,37 @@ add.vertex.attribute.issues.commented.count = function(list.of.networks, project #' more details. [default: "range"] #' @param default.value The default value to add if a vertex has no matching value [default: 0L] #' @param issue.type The issue kind,see \code{preprocess.issue.data} [default: "all"] +#' @param use.unfiltered.data whether to use unfiltered issue data (see \code{preprocess.issue.data}). +#' Note that filtered data may not contain issue creation events. +#' [default: TRUE] #' #' @return A list of networks with the added attribute -add.vertex.attribute.issue.creation.count = function(list.of.networks, project.data, +add.vertex.attribute.author.issue.creation.count = function(list.of.networks, project.data, name = "issue.creation.count", aggregation.level = c("range", "cumulative", "all.ranges", - "project.cumulative", "project.all.ranges", + "project.cumulative", + "project.all.ranges", "complete"), - default.value = 0L, issue.type = c("all", "pull.requests", "issues")) { + default.value = 0L, issue.type = c("all", "pull.requests", + "issues"), + use.unfiltered.data = TRUE) { issue.type = match.arg(issue.type) - if (name == "issue.creation.count" && identical(issue.type, "pull.requests")) { + if (missing(name) && identical(issue.type, "pull.requests")) { name = "pull.request.creation.count" } nets.with.attr = add.vertex.attribute.count.helper( list.of.networks, project.data, name, aggregation.level, - default.value, function(data) {return(get.author.issues.created.count(data, type = issue.type))}, "author.name" + default.value, function(data) { + return(get.author.issues.created.count(data, type = issue.type, use.unfiltered.data = use.unfiltered.data)) + }, "author.name" ) return(nets.with.attr) } -#' Add issue-comments-count attribute based on the number of comments in issues, where the person represented by the vertex is the author. 
+#' Add issue-comments-count attribute based on the number of comments in issues, where the person represented by the +#' vertex is the author. #' #' @param list.of.networks The network list #' @param project.data The project data @@ -560,22 +593,28 @@ add.vertex.attribute.issue.creation.count = function(list.of.networks, project.d #' more details. [default: "range"] #' @param default.value The default value to add if a vertex has no matching value [default: 0L] #' @param issue.type The issue kind,see \code{preprocess.issue.data} [default: "all"] +#' @param use.unfiltered.data whether to use unfiltered issue data (see \code{preprocess.issue.data}) [default: FALSE] #' #' @return A list of networks with the added attribute -add.vertex.attribute.issue.comment.count = function(list.of.networks, project.data, +add.vertex.attribute.author.issue.comment.count = function(list.of.networks, project.data, name = "issue.comment.count", aggregation.level = c("range", "cumulative", "all.ranges", - "project.cumulative", "project.all.ranges", + "project.cumulative", + "project.all.ranges", "complete"), - default.value = 0L, issue.type = c("all", "pull.requests", "issues")) { + default.value = 0L, issue.type = c("all", "pull.requests", + "issues"), + use.unfiltered.data = FALSE) { issue.type = match.arg(issue.type) - if (name == "issue.comment.count" && identical(issue.type, "pull.requests")) { + if (missing(name) && identical(issue.type, "pull.requests")) { name = "pull.request.comment.count" } nets.with.attr = add.vertex.attribute.count.helper( list.of.networks, project.data, name, aggregation.level, - default.value, function(data) {return(get.author.issue.comment.count(data, type = issue.type))}, "author.name" + default.value, function(data) { + return(get.author.issue.comment.count(data, type = issue.type, use.unfiltered.data = use.unfiltered.data)) + }, "author.name" ) return(nets.with.attr) @@ -591,7 +630,8 @@ add.vertex.attribute.issue.comment.count = function(list.of.networks, project.da #' @param default.value The default value to add if a vertex has no matching value [default: NA] #' #' @return A list of networks with the added attribute -add.vertex.attribute.author.email = function(list.of.networks, project.data, name = "author.email", default.value = NA) { +add.vertex.attribute.author.email = function(list.of.networks, project.data, name = "author.email", + default.value = NA) { nets.with.attr = split.and.add.vertex.attribute( list.of.networks, project.data, name, "complete", default.value, function(range, range.data, net) { @@ -627,7 +667,7 @@ add.vertex.attribute.author.email = function(list.of.networks, project.data, nam #' [default: FALSE] #' #' @return A list of networks with the added attribute. 
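A minimal usage sketch for the renamed author-level issue-count attributes above, assuming a hypothetical list of author networks `networks` and a project data object `project.data` (both names are placeholders, not part of this patch):

networks = add.vertex.attribute.author.issue.count(networks, project.data,
                                                   issue.type = "pull.requests",
                                                   use.unfiltered.data = FALSE)
## since `name` is missing and `issue.type` is "pull.requests", the attribute
## is stored as "pull.request.count" on each author vertex
igraph::vertex_attr(networks[[1]], "pull.request.count")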
-add.vertex.attribute.first.activity = function(list.of.networks, project.data, +add.vertex.attribute.author.first.activity = function(list.of.networks, project.data, activity.types = c("mails", "commits", "issues"), name = "first.activity", aggregation.level = c("range", "cumulative", "all.ranges", @@ -669,8 +709,8 @@ add.vertex.attribute.first.activity = function(list.of.networks, project.data, return(data) } - nets.with.attr = split.and.add.vertex.attribute(list.of.networks, project.data, name, aggregation.level, vertex.default, - compute.attr, list.attributes = TRUE) + nets.with.attr = split.and.add.vertex.attribute(list.of.networks, project.data, name, aggregation.level, + vertex.default, compute.attr, list.attributes = TRUE) return(nets.with.attr) } @@ -690,7 +730,7 @@ add.vertex.attribute.first.activity = function(list.of.networks, project.data, #' [default: FALSE] #' #' @return A list of networks with the added attribute -add.vertex.attribute.active.ranges = function(list.of.networks, project.data, name = "active.ranges", +add.vertex.attribute.author.active.ranges = function(list.of.networks, project.data, name = "active.ranges", activity.types = c("mails", "commits", "issues"), default.value = list(), combine.activity.types = FALSE) { @@ -860,7 +900,7 @@ add.vertex.attribute.author.role = function(list.of.networks, classification.res ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Artifact network functions ---------------------------------------------- -## * Change count ---------------------------------------------------------- +## * Commit-based metrics -------------------------------------------------- #' Add the count of unique editors (i.e., authors) that worked on an artifact #' @@ -889,7 +929,8 @@ add.vertex.attribute.artifact.editor.count = function(list.of.networks, project. if (missing(editor.definition)) { editor.definition = "author" } else { - editor.definition = match.arg.or.default(editor.definition, choices = c("author", "committer"), several.ok = TRUE) + editor.definition = match.arg.or.default(editor.definition, choices = c("author", "committer"), + several.ok = TRUE) } editor.definition = paste0(editor.definition, ".name") @@ -944,8 +985,6 @@ add.vertex.attribute.artifact.change.count = function(list.of.networks, project. return(nets.with.attr) } -## * Activity -------------------------------------------------------------- - #' Add the first occurrence of the artifact #' #' @param list.of.networks The network list @@ -961,7 +1000,8 @@ add.vertex.attribute.artifact.change.count = function(list.of.networks, project. #' @return A list of networks with the added attribute add.vertex.attribute.artifact.first.occurrence = function(list.of.networks, project.data, name = "first.occurrence", aggregation.level = c("range", "cumulative", "all.ranges", - "project.cumulative", "project.all.ranges", + "project.cumulative", + "project.all.ranges", "complete"), default.value = NA) { aggregation.level = match.arg.or.default(aggregation.level, default = "complete") @@ -982,6 +1022,525 @@ add.vertex.attribute.artifact.first.occurrence = function(list.of.networks, proj return(nets.with.attr) } +#' Add the date of the last edit of the artifact +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "last.edited"] +#' @param aggregation.level Determines the data to use for the attribute calculation. 
+#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. [default: "complete"] +#' @param default.value The default value to add if a vertex has no matching value [default: NA] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.artifact.last.edited = function(list.of.networks, project.data, name = "last.edited", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", + "project.all.ranges", + "complete"), + default.value = NA) { + aggregation.level = match.arg.or.default(aggregation.level, default = "complete") + + ## make sure that the default value contains a tzone attribute (even if the default value is NA) + default.value = get.date.from.string(default.value) + + nets.with.attr = split.and.add.vertex.attribute( + list.of.networks, project.data, name, aggregation.level, default.value, + function(range, range.data, net) { + artifact.to.dates = get.key.to.value.from.df(range.data$get.commits(), "artifact", "date") + artifact.to.last = lapply(artifact.to.dates, function(a) { + max(a[["date"]]) + }) + return(artifact.to.last) + } + ) + return(nets.with.attr) +} + +## * Mail thread metrics --------------------------------------------------- + +#' Add the number of contributors to each mail thread +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "thread.contributor.count"] +#' @param aggregation.level Determines the data to use for the attribute calculation. +#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. [default: "complete"] +#' @param default.value The default value to add if a vertex has no matching value [default: NA] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.mail.thread.contributor.count = function(list.of.networks, project.data, + name = "thread.contributor.count", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", + "project.all.ranges", "complete"), + default.value = NA) { + aggregation.level = match.arg.or.default(aggregation.level, default = "complete") + + nets.with.attr = split.and.add.vertex.attribute( + list.of.networks, project.data, name, aggregation.level, default.value, + function(range, range.data, net) { + return(get.mail.thread.contributor.count(range.data)) + } + ) + return(nets.with.attr) +} + +#' Add the number of messages in each mail thread +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "thread.message.count"] +#' @param aggregation.level Determines the data to use for the attribute calculation. +#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. 
[default: "complete"] +#' @param default.value The default value to add if a vertex has no matching value [default: NA] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.mail.thread.message.count = function(list.of.networks, project.data, name = "thread.message.count", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", + "project.all.ranges", + "complete"), + default.value = NA) { + aggregation.level = match.arg.or.default(aggregation.level, default = "complete") + + nets.with.attr = split.and.add.vertex.attribute( + list.of.networks, project.data, name, aggregation.level, default.value, + function(range, range.data, net) { + return(get.mail.thread.mail.count(range.data)) + } + ) + return(nets.with.attr) +} + +#' Add the date of the first message in each mail thread +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "thread.start.date"] +#' @param aggregation.level Determines the data to use for the attribute calculation. +#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. [default: "complete"] +#' @param default.value The default value to add if a vertex has no matching value [default: NA] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.mail.thread.start.date = function(list.of.networks, project.data, name = "thread.start.date", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", + "project.all.ranges", + "complete"), + default.value = NA) { + aggregation.level = match.arg.or.default(aggregation.level, default = "complete") + + ## make sure that the default value contains a tzone attribute (even if the default value is NA) + default.value = get.date.from.string(default.value) + + nets.with.attr = split.and.add.vertex.attribute( + list.of.networks, project.data, name, aggregation.level, default.value, + function(range, range.data, net) { + return(get.mail.thread.start.date(range.data)) + } + ) + return(nets.with.attr) +} + +#' Add the date of the first message in each mail thread +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "thread.end.date"] +#' @param aggregation.level Determines the data to use for the attribute calculation. +#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. 
[default: "complete"] +#' @param default.value The default value to add if a vertex has no matching value [default: NA] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.mail.thread.end.date = function(list.of.networks, project.data, name = "thread.end.date", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", "project.all.ranges", + "complete"), + default.value = NA) { + aggregation.level = match.arg.or.default(aggregation.level, default = "complete") + + ## make sure that the default value contains a tzone attribute (even if the default value is NA) + default.value = get.date.from.string(default.value) + + nets.with.attr = split.and.add.vertex.attribute( + list.of.networks, project.data, name, aggregation.level, default.value, + function(range, range.data, net) { + return(get.mail.thread.end.date(range.data)) + } + ) + return(nets.with.attr) +} + +#' Add the identifier of the mailing list where a mail thread originates. +#' See \code{get.mail.thread.originating.mailing.list} for more details. +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "thread.originating.mailing.list"] +#' @param aggregation.level Determines the data to use for the attribute calculation. +#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. [default: "complete"] +#' @param default.value The default value to add if a vertex has no matching value [default: NA] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.mail.thread.originating.mailing.list = function(list.of.networks, project.data, + name = "thread.originating.mailing.list", + aggregation.level = c("range", "cumulative", + "all.ranges", + "project.cumulative", + "project.all.ranges", + "complete"), + default.value = NA) { + aggregation.level = match.arg.or.default(aggregation.level, default = "complete") + nets.with.attr = split.and.add.vertex.attribute( + list.of.networks, project.data, name, aggregation.level, default.value, + function(range, range.data, net) { + return(get.mail.thread.originating.mailing.list(range.data)) + } + ) + return(nets.with.attr) +} + +## * Issue metrics --------------------------------------------------------- + +#' Add the number of contributors to each issue or PR +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "issue.contributor.count"] +#' @param aggregation.level Determines the data to use for the attribute calculation. +#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. [default: "complete"] +#' @param type which issue type to consider (see \code{preprocess.issue.data}). 
+#' One of \code{"issues"}, \code{"pull.requests"} or \code{"all"} +#' [default: "all"] +#' @param default.value The default value to add if a vertex has no matching value [default: NA] +#' @param use.unfiltered.data whether to use unfiltered issue data (see \code{preprocess.issue.data}) [default: FALSE] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.issue.contributor.count = function(list.of.networks, project.data, + name = "issue.contributor.count", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", + "project.all.ranges", "complete"), + type = c("all", "issues", "pull.requests"), + default.value = NA, use.unfiltered.data = FALSE) { + type = match.arg(type) + aggregation.level = match.arg.or.default(aggregation.level, default = "complete") + if (missing(name) && identical(type, "pull.requests")) { + name = "pr.contributor.count" + } + + nets.with.attr = split.and.add.vertex.attribute( + list.of.networks, project.data, name, aggregation.level, default.value, + function(range, range.data, net) { + return(get.issue.contributor.count(range.data, type = type, use.unfiltered.data = use.unfiltered.data)) + } + ) + return(nets.with.attr) +} + +#' Add the number of events for each issue or PR +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "issue.event.count"] +#' @param aggregation.level Determines the data to use for the attribute calculation. +#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. [default: "complete"] +#' @param type which issue type to consider (see \code{preprocess.issue.data}). +#' One of \code{"issues"}, \code{"pull.requests"} or \code{"all"} +#' [default: "all"] +#' @param default.value The default value to add if a vertex has no matching value [default: NA] +#' @param use.unfiltered.data whether to use unfiltered issue data (see \code{preprocess.issue.data}) [default: FALSE] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.issue.event.count = function(list.of.networks, project.data, name = "issue.event.count", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", "project.all.ranges", + "complete"), + type = c("all", "issues", "pull.requests"), default.value = NA, + use.unfiltered.data = FALSE) { + type = match.arg(type) + aggregation.level = match.arg.or.default(aggregation.level, default = "complete") + if (missing(name) && identical(type, "pull.requests")) { + name = "pr.event.count" + } + + nets.with.attr = split.and.add.vertex.attribute( + list.of.networks, project.data, name, aggregation.level, default.value, + function(range, range.data, net) { + return(get.issue.event.count(range.data, type = type, use.unfiltered.data = use.unfiltered.data)) + } + ) + return(nets.with.attr) +} + +#' Add the number of comment events for each issue or PR +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "issue.comment.event.count"] +#' @param aggregation.level Determines the data to use for the attribute calculation. +#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. 
See \code{split.data.by.networks} for +#' more details. [default: "complete"] +#' @param type which issue type to consider (see \code{preprocess.issue.data}). +#' One of \code{"issues"}, \code{"pull.requests"} or \code{"all"} +#' [default: "all"] +#' @param default.value The default value to add if a vertex has no matching value [default: NA] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.issue.comment.event.count = function(list.of.networks, project.data, + name = "issue.comment.event.count", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", + "project.all.ranges", + "complete"), + type = c("all", "issues", "pull.requests"), + default.value = NA) { + type = match.arg(type) + aggregation.level = match.arg.or.default(aggregation.level, default = "complete") + if (missing(name) && identical(type, "pull.requests")) { + name = "pr.comment.event.count" + } + + nets.with.attr = split.and.add.vertex.attribute( + list.of.networks, project.data, name, aggregation.level, default.value, + function(range, range.data, net) { + return(get.issue.comment.count(range.data, type = type)) + } + ) + return(nets.with.attr) +} + +#' Add the date each issue or PR was opened +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "issue.opened.date"] +#' @param aggregation.level Determines the data to use for the attribute calculation. +#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. [default: "complete"] +#' @param type which issue type to consider (see \code{preprocess.issue.data}). +#' One of \code{"issues"}, \code{"pull.requests"} or \code{"all"} +#' [default: "all"] +#' @param default.value The default value to add if a vertex has no matching value [default: NA] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.issue.opened.date = function(list.of.networks, project.data, name = "issue.opened.date", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", "project.all.ranges", + "complete"), + type = c("all", "issues", "pull.requests"), default.value = NA) { + type = match.arg(type) + aggregation.level = match.arg.or.default(aggregation.level, default = "complete") + if (missing(name) && identical(type, "pull.requests")) { + name = "pr.opened.date" + } + + ## make sure that the default value contains a tzone attribute (even if the default value is NA) + default.value = get.date.from.string(default.value) + + nets.with.attr = split.and.add.vertex.attribute( + list.of.networks, project.data, name, aggregation.level, default.value, + function(range, range.data, net) { + return(get.issue.opened.date(range.data, type = type)) + } + ) + return(nets.with.attr) +} + +#' Add the date each issue or PR was closed, or NA if it was not yet closed. +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "issue.closed.date"] +#' @param aggregation.level Determines the data to use for the attribute calculation. +#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. 
[default: "complete"] +#' @param type which issue type to consider (see \code{preprocess.issue.data}). +#' One of \code{"issues"}, \code{"pull.requests"} or \code{"all"} +#' [default: "all"] +#' @param default.value The default value to add if a vertex has no matching value [default: NA] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.issue.closed.date = function(list.of.networks, project.data, name = "issue.closed.date", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", "project.all.ranges", + "complete"), + type = c("all", "issues", "pull.requests"), default.value = NA) { + type = match.arg(type) + aggregation.level = match.arg.or.default(aggregation.level, default = "complete") + if (missing(name) && identical(type, "pull.requests")) { + name = "pr.closed.date" + } + + ## make sure that the default value contains a tzone attribute (even if the default value is NA) + default.value = get.date.from.string(default.value) + + nets.with.attr = split.and.add.vertex.attribute( + list.of.networks, project.data, name, aggregation.level, default.value, + function(range, range.data, net) { + return(get.issue.closed.date(range.data, type = type)) + } + ) + return(nets.with.attr) +} + +#' Add the date of the last activity in each issue or PR +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "issue.last.activity"] +#' @param aggregation.level Determines the data to use for the attribute calculation. +#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. [default: "complete"] +#' @param type which issue type to consider (see \code{preprocess.issue.data}). +#' One of \code{"issues"}, \code{"pull.requests"} or \code{"all"} +#' [default: "all"] +#' @param default.value The default value to add if a vertex has no matching value [default: NA] +#' @param use.unfiltered.data whether to use unfiltered issue data (see \code{preprocess.issue.data}) [default: FALSE] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.issue.last.activity.date = function(list.of.networks, project.data, name = "issue.last.activity", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", + "project.all.ranges", "complete"), + type = c("all", "issues", "pull.requests"), + default.value = NA, use.unfiltered.data = FALSE) { + type = match.arg(type) + aggregation.level = match.arg.or.default(aggregation.level, default = "complete") + if (missing(name) && identical(type, "pull.requests")) { + name = "pr.last.activity" + } + + ## make sure that the default value contains a tzone attribute (even if the default value is NA) + default.value = get.date.from.string(default.value) + + nets.with.attr = split.and.add.vertex.attribute( + list.of.networks, project.data, name, aggregation.level, default.value, + function(range, range.data, net) { + return(get.issue.last.activity.date(range.data, type = type, use.unfiltered.data = use.unfiltered.data)) + } + ) + return(nets.with.attr) +} + +#' Add the title of each issue or PR +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "issue.title"] +#' @param aggregation.level Determines the data to use for the attribute calculation. 
+#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. [default: "complete"] +#' @param type which issue type to consider (see \code{preprocess.issue.data}). +#' One of \code{"issues"}, \code{"pull.requests"} or \code{"all"} +#' [default: "all"] +#' @param default.value The default value to add if a vertex has no matching value [default: NA] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.issue.title = function(list.of.networks, project.data, name = "issue.title", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", "project.all.ranges", + "complete"), + type = c("all", "issues", "pull.requests"), default.value = NA) { + type = match.arg(type) + aggregation.level = match.arg.or.default(aggregation.level, default = "complete") + if (missing(name) && identical(type, "pull.requests")) { + name = "pr.title" + } + + nets.with.attr = split.and.add.vertex.attribute( + list.of.networks, project.data, name, aggregation.level, default.value, + function(range, range.data, net) { + return(get.issue.title(range.data, type = type)) + } + ) + return(nets.with.attr) +} + +#' Add whether each PR is open, has been merged, or was closed without merging. +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "pull.request.state"] +#' @param default.value The default value to add if a vertex has no matching value [default: NA] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.pr.open.merged.or.closed = function(list.of.networks, project.data, name = "pull.request.state", + default.value = NA) { + + nets.with.attr = split.and.add.vertex.attribute( + list.of.networks, project.data, name, aggregation.level = "complete", default.value, + function(range, range.data, net) { + return(get.pr.open.merged.or.closed(range.data)) + } + ) + return(nets.with.attr) +} + +#' Add whether an issue is a pull request +#' +#' @param list.of.networks The network list +#' @param project.data The project data +#' @param name The attribute name to add [default: "issue.is.pull.request"] +#' @param aggregation.level Determines the data to use for the attribute calculation. +#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, +#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and +#' \code{"complete"}. See \code{split.data.by.networks} for +#' more details. 
[default: "complete"] +#' @param default.value The default value to add if a vertex has no matching value [default: NA] +#' +#' @return A list of networks with the added attribute +add.vertex.attribute.issue.is.pull.request = function(list.of.networks, project.data, name = "issue.is.pull.request", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", + "project.all.ranges", "complete"), + default.value = NA) { + aggregation.level = match.arg.or.default(aggregation.level, default = "complete") + + nets.with.attr = split.and.add.vertex.attribute( + list.of.networks, project.data, name, aggregation.level, default.value, + function(range, range.data, net) { + return(get.issue.is.pull.request(range.data)) + } + ) + return(nets.with.attr) +} + + ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Helper ------------------------------------------------------------------ @@ -989,7 +1548,8 @@ add.vertex.attribute.artifact.first.occurrence = function(list.of.networks, proj #' #' @param activity.types The activity types to compute information for. [default: c("mails", "commits", "issues")] #' @param range.data The data to base the computation on. -#' @param default.value The default value to add if no information is available per author and activity type. [default: NA] +#' @param default.value The default value to add if no information is available per author and activity type. +#' [default: NA] #' #' @return A list containing per author a list of first activity values named with the corresponding activity type. #' Empty list if there are no activities in \code{range.data} at all or none corresponding to the configured @@ -1106,11 +1666,13 @@ get.first.activity.data = function(range.data, activity.types = c("commits", "ma #' #' @param activity.types The activity types to compute information for. [default: c("mails", "commits", "issues")] #' @param net.to.range.list The data to base the computation on, split by networks. -#' @param default.value The default value to add if no information is available per author and activity type. [default: list()] +#' @param default.value The default value to add if no information is available per author and activity type. +#' [default: list()] #' #' @return A list with elements representing the authors, each containing a list of elements representing the activity #' types, each containing a list of active ranges. -get.active.ranges.data = function(activity.types = c("mails", "commits", "issues"), net.to.range.list, default.value = list()) { +get.active.ranges.data = function(activity.types = c("mails", "commits", "issues"), net.to.range.list, + default.value = list()) { ## a list with elements representing the parsed activity types, each containing a list of elements ## representing the ranges the data was split by, each containing a list of authors who were active @@ -1151,8 +1713,8 @@ get.active.ranges.data = function(activity.types = c("mails", "commits", "issues #' This function takes a nested list and switches the order of the nesting levels: the innermost level is moved to the -#' outside. This is done by reproducing the given list structure for every element occuring in one of the innermost lists -#' and then deleting every sublist in which the element does not occur. For example, on input +#' outside. This is done by reproducing the given list structure for every element occuring in one of the innermost +#' lists and then deleting every sublist in which the element does not occur. 
For example, on input #' #' type.range.author = list( #' "type1" = list( @@ -1186,7 +1748,8 @@ get.active.ranges.data = function(activity.types = c("mails", "commits", "issues #' ) #' #' @param nested.list A list nested AT LEAST ONCE, that means: the elements of the outermost list are also lists. The -#' nesting depth of all inner lists must be the same and the lists must be named at every nesting level. +#' nesting depth of all inner lists must be the same and the lists must be named at every nesting +#' level. #' #' @return The nested list with the innermost level as new outermost level. transpose.nested.list.by.innermost.values = function(nested.list) { @@ -1200,7 +1763,8 @@ transpose.nested.list.by.innermost.values = function(nested.list) { if (length(structure) == 0) { return(list()) - ## Base case 2: if the structure isn't nested itself, it is only returned, if it contains the given innerst.element. + ## Base case 2: if the structure isn't nested itself, it is only returned if it contains the given + ## innerst.element. ## Otherwise, NA is returned. } else if (!is.list(structure[[1]])) { if (innerst.element %in% structure) { diff --git a/util-networks.R b/util-networks.R index 44abc7fc..5da3b4d5 100644 --- a/util-networks.R +++ b/util-networks.R @@ -20,6 +20,7 @@ ## Copyright 2018-2019 by Jakob Kronawitter ## Copyright 2020 by Anselm Fehnker ## Copyright 2021 by Niklas Schneider +## Copyright 2022 by Jonathan Baumann ## All Rights Reserved. @@ -1171,6 +1172,9 @@ construct.network.from.edge.list = function(vertices, edge.list, network.conf, d #' Merges a list vertex data frame and merges a list of edge #' data frames #' +#' Note that identical vertices are merged, whereas identical edges are not. +#' This will lead to duplicated edges if you merge a network with itself. +#' #' @param vertex.data the list of vertex data frames, may be \code{NULL} #' @param edge.data the list of edge data frames, may be \code{NULL} #' @@ -1204,6 +1208,9 @@ merge.network.data = function(vertex.data, edge.data) { #' Merges a list of networks to one big network #' +#' Note that identical vertices are merged, whereas identical edges are not. +#' This will lead to duplicated edges if you merge a network with itself. +#' #' @param networks the list of networks #' #' @return the built one network @@ -1517,9 +1524,10 @@ extract.artifact.network.from.network = function(network, remove.isolates = FALS #' **Note**: This function throws an error when the edge attribute \code{type} is missing.
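To make the duplicated-edges caveat above concrete, here is a small sketch that uses only the documented `merge.networks` function; `net.a` is a placeholder for an arbitrary coronet network:

merged = merge.networks(list(net.a, net.a))
## identical vertices are collapsed, identical edges are kept separately
igraph::vcount(merged) == igraph::vcount(net.a)     # TRUE
igraph::ecount(merged) == 2 * igraph::ecount(net.a) # TRUE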
#' #' @param network the (multi) network to reduce +#' @param remove.isolates whether to remove isolated vertices during extraction [default: FALSE] #' #' @return the bipartite-edge-induced subgraph of \code{network} -extract.bipartite.network.from.network = function(network) { +extract.bipartite.network.from.network = function(network, remove.isolates = FALSE) { ## check whether there are vertices in the network, otherwise skip the extraction if (igraph::vcount(network) == 0) { @@ -1533,7 +1541,7 @@ extract.bipartite.network.from.network = function(network) { } ## only retain all bipartite edges and induced vertices - bip.network = igraph::subgraph.edges(network, igraph::E(network)[type == TYPE.EDGES.INTER]) + bip.network = igraph::subgraph.edges(network, igraph::E(network)[type == TYPE.EDGES.INTER], delete.vertices = remove.isolates) return(bip.network) } diff --git a/util-read.R b/util-read.R index e17fe7af..1f0451d0 100644 --- a/util-read.R +++ b/util-read.R @@ -22,6 +22,7 @@ ## Copyright 2020-2021 by Niklas Schneider ## Copyright 2021 by Johannes Hostert ## Copyright 2021 by Mirabdulla Yusifli +## Copyright 2022 by Jonathan Baumann ## All Rights Reserved. ## Note: @@ -466,7 +467,7 @@ read.authors = function(data.path) { ## get file name of commit data file = file.path(data.path, "authors.list") - ## read data.frame from disk (as expected from save.list.to.file) [can be empty] + ## read data.frame from disk (as expected from save.list.to.file) authors.df = try(read.table(file, header = FALSE, sep = ";", strip.white = TRUE, encoding = "UTF-8"), silent = TRUE) @@ -852,6 +853,43 @@ create.empty.synchronicity.list = function() { } +## * Custom timestamps for splitting + +#' Read custom event timestamps from a file in \code{.list} format. +#' +#' @param data.path the path of the directory containing the file +#' @param file.name the name of the file +#' +#' @return the read timestamps +read.custom.event.timestamps = function(data.path, file.name) { + logging::logdebug("read.custom.event.timestamps: starting.") + + file = file.path(data.path, file.name) + + ## read data.frame from disk (as expected from save.list.to.file) [can be empty] + custom.event.timestamps.table = try(read.table(file, header = FALSE, sep = ";", strip.white = TRUE, + encoding = "UTF-8"), silent = TRUE) + + ## handle the case that the list of timestamps is empty + if (inherits(custom.event.timestamps.table, "try-error")) { + logging::logwarn("There are no custom timestamps available at the given path.") + logging::logwarn("Datapath: %s", data.path) + + ## return an empty list + return(list()) + } + timestamps = as.list(custom.event.timestamps.table[[2]]) + names(timestamps) = custom.event.timestamps.table[[1]] + + ## Sort the timestamps + if (length(timestamps) != 0) { + timestamps = timestamps[order(unlist(get.date.from.string(timestamps)))] + } + + logging::logdebug("read.custom.event.timestamps: finished.") + return(timestamps) +} + ## Helper functions -------------------------------------------------------- ## declare a global format for the commit.id column in several data frames diff --git a/util-split.R b/util-split.R index 91c64bab..1c0ea9e9 100644 --- a/util-split.R +++ b/util-split.R @@ -21,6 +21,7 @@ ## Copyright 2020 by Thomas Bock ## Copyright 2021 by Niklas Schneider ## Copyright 2021 by Johannes Hostert +## Copyright 2022 by Jonathan Baumann ## All Rights Reserved.
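The reader `read.custom.event.timestamps` above and `split.data.time.based.by.timestamps` (added below) are meant to work together. A minimal sketch, assuming a hypothetical file `custom-events.list` inside the data directory that holds semicolon-separated name/timestamp rows (path and file name are placeholders):

## contents of custom-events.list:
##   Release 1.0;2021-03-01 12:00:00
##   Feature freeze;2021-06-15 00:00:00
timestamps = read.custom.event.timestamps("path/to/data", "custom-events.list")
## `timestamps` is a named list sorted by date; its values can serve as bins
## to cut project data exactly at these events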
@@ -44,11 +45,11 @@ requireNamespace("lubridate") # for date conversion #' @param time.period the time period describing the length of the ranges, a character string, #' e.g., "3 mins" or "15 days" [default: "3 months"] #' @param bins the date objects defining the start of ranges (the last date defines the end of the last range, in an -#' *exclusive* manner). If set, the 'time.period' parameter is ignored; consequently, 'split.basis' and -#' 'sliding.window' do not make sense then either. [default: NULL] +#' *exclusive* manner). If set, the \code{time.period} parameter is ignored; consequently, \code{split.basis} and +#' \code{sliding.window} do not make sense then either. [default: NULL] #' @param number.windows the number of consecutive data objects to get from this function, implying equally -#' time-sized windows for all ranges. If set, the 'time.period' and 'bins' parameters are ignored; -#' consequently, 'split.basis' and 'sliding.window' do not make sense then either. +#' time-sized windows for all ranges. If set, the \code{time.period} and \code{bins} parameters are ignored; +#' consequently, \code{sliding.window} does not make sense then either. #' [default: NULL] #' @param split.basis the data name to use as the basis for split bins, either 'commits', 'mails', or 'issues' #' [default: "commits"] @@ -91,7 +92,6 @@ split.data.time.based = function(project.data, time.period = "3 months", bins = }) names(additional.data) = additional.data.sources - ## number of windows given (ignoring time period and bins) if (!is.null(number.windows)) { ## reset bins for the later algorithm @@ -215,7 +215,21 @@ split.data.time.based = function(project.data, time.period = "3 months", bins = ## add splitting information to project configuration project.conf.new$set.splitting.info( type = "time-based", - length = if (split.by.bins) bins else time.period, + length = if (split.by.bins) { + bins + } + else { + if (!is.null(number.windows)) { + as.character(lubridate::as.period( + get.time.period.by.amount( + min(data[[split.basis]][["date"]]), + max(data[[split.basis]][["date"]]), + number.windows + ) + )) + } + else time.period + }, basis = split.basis, sliding.window = sliding.window, revisions = bins, @@ -229,6 +243,30 @@ split.data.time.based = function(project.data, time.period = "3 months", bins = return(cf.data) } +#' Split project data by timestamps +#' +#' Splits project data into ranges, where the first range starts with the first timestamp +#' and the last range ends with the last timestamp. +#' +#' If timestamps are not provided, the custom event timestamps in \code{project.data} are +#' used instead. +#' +#' @param project.data the *Data object from which the data is retrieved +#' @param bins a vector of timestamps [default: NULL] +#' @param project.conf.new the new project config to construct the \code{RangeData} objects. +#' If \code{NULL}, a clone of \code{project.data$get.project.conf()} will be used. +#' [default: NULL] +#' +#' @return the list of RangeData objects, each referring to one time period +split.data.time.based.by.timestamps = function(project.data, bins = NULL, project.conf.new = NULL) { + + if (is.null(bins)) { # bins were not provided, use custom timestamps from project + bins = unlist(project.data$get.custom.event.timestamps()) + } + + return(split.data.time.based(project.data, bins = bins, project.conf.new = project.conf.new)) +} + #' Split project data in activity-based ranges as specified #' #' Important: For a given amount of activity, the last set of data may be a lot smaller