-
Notifications
You must be signed in to change notification settings - Fork 0
/
dbschema.sql
157 lines (139 loc) · 5.02 KB
/
dbschema.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
-- Noteworthy:
-- 1. Table and column names not quoted will be uppercased. Quote them to keep them lower.
-- In Snowflake, primary keys and unique keys are informational only;
-- they are not enforced but can be used by the query optimizer to improve performance.
-- When you define a primary key, you're providing Snowflake with information that can be
-- used to optimize query plans, even though the uniqueness of primary key values is not enforced.
CREATE TABLE IF NOT EXISTS "contributor_stats" (
"repo" VARCHAR(255),
"contributor_nodeid" VARCHAR(255),
"contributor_name" VARCHAR(255),
"contributor_username" VARCHAR(255),
"curved_score" FLOAT,
"stats_beginning" DATE,
"stats_ending" DATE,
"contributor_first_commit_date" DATE,
"num_workdays" INT,
"commits" INT,
"prs" INT,
"review_comments" INT,
"changed_lines" INT,
"avg_pr_duration" FLOAT,
"avg_code_movement_per_pr" FLOAT,
"commits_per_day" FLOAT,
"changed_lines_per_day" FLOAT,
"prs_per_day" FLOAT,
"review_comments_per_day" FLOAT,
"prs_diff_from_mean" FLOAT,
"prs_ntile" INT,
"commits_ntile" INT,
"lines_of_code_ntile" INT,
"review_comments_ntile" INT,
"avg_pr_duration_ntile" INT,
"avg_ntile" INT,
"median_pr_review_duration" FLOAT,
PRIMARY KEY ("contributor_nodeid", "repo", "stats_beginning")
);
-- This table is used to stage new ghstats storage
-- since the snowflake library we are using doesn't
-- support upserts. We store the entire dataframe
-- here, then use a merge operation to copy it into
-- the contributor_stats table
CREATE TABLE IF NOT EXISTS "contributor_stats_staging" (
"repo" VARCHAR(255),
"contributor_nodeid" VARCHAR(255),
"contributor_name" VARCHAR(255),
"contributor_username" VARCHAR(255),
"curved_score" FLOAT,
"stats_beginning" DATE,
"stats_ending" DATE,
"contributor_first_commit_date" DATE,
"num_workdays" INT,
"commits" INT,
"prs" INT,
"review_comments" INT,
"changed_lines" INT,
"avg_pr_duration" FLOAT,
"avg_code_movement_per_pr" FLOAT,
"commits_per_day" FLOAT,
"changed_lines_per_day" FLOAT,
"prs_per_day" FLOAT,
"review_comments_per_day" FLOAT,
"prs_diff_from_mean" FLOAT,
"prs_ntile" INT,
"commits_ntile" INT,
"lines_of_code_ntile" INT,
"review_comments_ntile" INT,
"avg_pr_duration_ntile" INT,
"avg_ntile" INT,
"median_pr_review_duration" FLOAT,
PRIMARY KEY ("contributor_nodeid", "repo", "stats_beginning")
);
CREATE TABLE IF NOT EXISTS "contributors" (
"contributor_nodeid" VARCHAR(255),
"contributor_name" VARCHAR(255),
"contributor_username" VARCHAR(255),
PRIMARY KEY ("contributor_nodeid")
);
-- repo_name - the name of the repository
-- stats_beginning - the beginning of the stats period
-- stats_ending - the end of the stats period
-- num_workdays - the number of workdays in the period
-- num_contributors - the number of contributors to the repo
-- avg_pr_duration - the average duration of a pull request across the repo during the period
-- median_pr_duration - the median duration of a pull request across the repo during the period
-- num_prs - the number of pull requests across the repo during the period
-- num_commits - the number of commits across the repo during the period
CREATE TABLE IF NOT EXISTS "repo_stats" (
"repo_name" VARCHAR(255),
"stats_beginning" DATE,
"stats_ending" DATE,
"num_workdays" INT,
"num_contributors" INT,
"avg_pr_duration" FLOAT,
"median_pr_duration" FLOAT,
"num_prs" INT,
"num_commits" INT,
PRIMARY KEY ("repo_name", "stats_beginning")
);
-- A repo can have multiple topics. A topic can belong to multiple repos.
CREATE TABLE if not EXISTS "repo_topics" (
"repo_name" VARCHAR(255),
"repo_topic" VARCHAR(255),
PRIMARY KEY ("repo_name", "repo_topic")
);
-- PR review comments stored to do AI-based analysis of them
CREATE TABLE IF NOT EXISTS "pr_review_comments" (
"comment_id" BIGINT PRIMARY KEY,
"repo_name" VARCHAR(256),
"pr_number" VARCHAR(64),
"user_login" VARCHAR(256),
"body" TEXT,
"created_at" TIMESTAMP_NTZ
);
-- Staging table for PR review comments so we can more efficiently merge
CREATE TABLE IF NOT EXISTS "pr_review_comments_staging" (
"comment_id" BIGINT PRIMARY KEY,
"repo_name" VARCHAR(256),
"pr_number" VARCHAR(64),
"user_login" VARCHAR(256),
"body" TEXT,
"created_at" TIMESTAMP_NTZ
);
CREATE TABLE IF NOT EXISTS "pr_review_comments_training" (
"comment_id" BIGINT PRIMARY KEY,
"repo_name" VARCHAR(256),
"pr_number" VARCHAR(64),
"user_login" VARCHAR(256),
"body" TEXT,
"created_at" TIMESTAMP_NTZ,
"label" VARCHAR(50) -- this is our category column
);
-- SQL here for convenience to backup the contributor_stats table
-- BEGIN TRANSACTION;
-- -- Empty the destination table
-- DELETE FROM "contributor_stats_backup";
-- -- Copy contents from the source table to the destination table
-- INSERT INTO "contributor_stats_backup"
-- SELECT * FROM "contributor_stats";
-- COMMIT;