forked from tomclegg/get-evidence
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathimport_gwas.php
executable file
·381 lines (321 loc) · 11.3 KB
/
import_gwas.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
#!/usr/bin/php
<?php
;
// Copyright: see COPYING
// Authors: see git-blame(1)
if ($_SERVER["argc"] != 2)
{
die ("Usage: ".$_SERVER["argv"][0]." gwas.csv\n");
}
chdir ('public_html');
require_once 'lib/setup.php';
require_once 'lib/genomes.php';
require_once 'lib/openid.php';
require_once 'lib/bp.php';
ini_set ("output_buffering", FALSE);
ini_set ("memory_limit", 67108864);
print "Creating/updating get-evidence tables...";
genomes_create_tables ();
print "\n";
openid_login_as_robot ("GWAS Importing Robot");
$fh = fopen ($_SERVER["argv"][1], "r");
$columns = fgets ($fh);
fclose ($fh);
print "Creating temporary table...";
if (eregi ('^date added[^,]*,pubmedid,first author,date,', $columns)) {
// Date Added to Catalog (since 11/25/08),PubMedID,First Author,Date,Journal,Link,Study,Disease/Trait,Initial Sample Size,Replication Sample Size,Region,Reported Gene(s),Strongest SNP-Risk Allele,SNPs,Risk Allele Frequency,p-Value,p-Value (text),OR or beta,95% CI (text),Platform [SNPs passing QC],CNV
$inputformat = "genome.gov";
$q = theDb()->query ("CREATE TEMPORARY TABLE gwas (
`date_added` date,
`pmid` int unsigned,
`first_author` varchar(64),
`pub_date` VARCHAR(32),
`journal` varchar(64),
`url` varchar(255),
`study` varchar(255),
`disease_trait` varchar(255),
`initial_sample_size` VARCHAR(255),
`replication_sample_size` VARCHAR(255),
`region` VARCHAR(255),
`genes` VARCHAR(32),
`risk_allele` VARCHAR(32),
`snps` VARCHAR(32),
`risk_allele_frequency` VARCHAR(16),
`p_value` VARCHAR(32),
`p_value_text` VARCHAR(32),
`or_or_beta` VARCHAR(32),
`ci_95_text` VARCHAR(32),
`platform_SNPs_passing_QC` VARCHAR(32),
`cnv` CHAR(1),
INDEX(`snps`)
)");
}
else if (eregi ('^rs[^,]*, *gene *, *gene/region *, *trait', $columns)) {
// rs Number(region location) , Gene , Gene/Region , Trait , First Author , Journal , Published Year , PubMed ID , Sample Size (Initial/Replicate) , Risk Allele [Prevalence in control] , OR/Beta [95% CI] , p-Value , platform ,OR
$inputformat = "hugenet";
$q = theDb()->query ("CREATE TEMPORARY TABLE gwas (
`snps` VARCHAR(32),
`genes` VARCHAR(32),
`region` VARCHAR(255),
`disease_trait` varchar(255),
`first_author` varchar(64),
`journal` varchar(64),
`pub_date` VARCHAR(32),
`pmid` int unsigned,
`sample_size` VARCHAR(255),
`risk_allele` VARCHAR(32),
`or_or_beta` VARCHAR(32),
`p_value` VARCHAR(32),
`platform_SNPs_passing_QC` VARCHAR(32),
`or_or_beta_is_or` CHAR(1),
INDEX(`snps`)
)");
}
else {
die ("Unrecognized input format");
}
if (theDb()->isError($q)) print $q->getMessage;
print "\n";
print "Importing data...";
$q = theDb()->query ("LOAD DATA LOCAL INFILE ? INTO TABLE gwas
FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"'
LINES TERMINATED BY '\n'
IGNORE 1 LINES",
array ($_SERVER["argv"][1]));
if (theDb()->isError($q)) print $q->getMessage();
print theDb()->affectedRows();
print "\n";
print "Discarding multi-SNP records...";
theDb()->query ("DELETE FROM gwas WHERE snps LIKE '%,%'");
print theDb()->affectedRows();
print "\n";
print "Discarding records without rsid...";
theDb()->query ("DELETE FROM gwas WHERE snps NOT LIKE 'rs%'");
print theDb()->affectedRows();
print "\n";
print "Discarding trailing stuff in parens after rsid...";
theDb()->query ("UPDATE gwas
SET snps=SUBSTRING(snps,1,LOCATE('(',snps)-1)
WHERE snps LIKE '%(%'");
print theDb()->affectedRows();
print "\n";
if ($inputformat == "hugenet") {
print "Splitting sample_size into initial_ and replication_sample_size fields...";
theDb()->query ("ALTER TABLE gwas ADD initial_sample_size VARCHAR(255)");
theDb()->query ("ALTER TABLE gwas ADD replication_sample_size VARCHAR(255)");
theDb()->query ("
UPDATE gwas
SET initial_sample_size = SUBSTRING(sample_size,1,LOCATE('/',sample_size)-1),
replication_sample_size = SUBSTRING(sample_size,LOCATE('/',sample_size)+1)
");
print theDb()->affectedRows();
print "\n";
print "Splitting risk_allele into risk_allele_frequency field...";
theDb()->query ("ALTER TABLE gwas ADD risk_allele_frequency VARCHAR(16)");
theDb()->query ("
UPDATE gwas
SET risk_allele_frequency = SUBSTRING(risk_allele,LOCATE('[',risk_allele)+1)
");
print theDb()->affectedRows();
print "...";
theDb()->query ("
UPDATE gwas
SET risk_allele_frequency = SUBSTRING(risk_allele_frequency,1,LOCATE(']',risk_allele_frequency)-1),
risk_allele = SUBSTRING(risk_allele,1,LOCATE('[',risk_allele)-1)
");
print theDb()->affectedRows();
print "\n";
print "Adding p_value_text field...";
theDb()->query ("ALTER TABLE gwas ADD p_value_text VARCHAR(16) DEFAULT ''");
print theDb()->affectedRows();
print "\n";
print "Splitting CI interval out of or_or_beta field...";
theDb()->query ("ALTER TABLE gwas ADD ci_95_text VARCHAR(32)");
theDb()->query ("
UPDATE gwas
SET ci_95_text = SUBSTRING(or_or_beta,LOCATE('[[',or_or_beta)+1)
");
print theDb()->affectedRows();
print "...";
theDb()->query ("
UPDATE gwas
SET ci_95_text = SUBSTRING(ci_95_text,1,LOCATE(']]',ci_95_text)),
or_or_beta = SUBSTRING(or_or_beta,1,LOCATE('[[',or_or_beta)-1)
");
print theDb()->affectedRows();
print "\n";
print "Adding URL field...";
theDb()->query ("ALTER TABLE gwas ADD url VARCHAR(255)");
theDb()->query ("
UPDATE gwas
SET url=CONCAT('http://www.ncbi.nlm.nih.gov/pubmed/',pmid)
");
print theDb()->affectedRows();
print "\n";
}
print "Splitting risk-allele into allele field...";
theDb()->query ("ALTER TABLE gwas ADD allele CHAR(1)");
theDb()->query ("
UPDATE gwas
SET allele = UPPER(SUBSTRING(risk_allele,LOCATE('-',risk_allele)+1,1))
");
print theDb()->affectedRows();
print "\n";
print "Fixing up numeric fields...";
theDb()->query ("
UPDATE gwas
SET risk_allele_frequency = IF(risk_allele_frequency IN ('NR','Pending'),NULL,risk_allele_frequency),
p_value = IF(p_value IN ('NS'),NULL,p_value),
p_value_text = REPLACE(p_value_text,char(255),'')
");
print theDb()->affectedRows();
theDb()->query ("
ALTER TABLE gwas
CHANGE risk_allele_frequency risk_allele_frequency DECIMAL(3,2)
");
print "\n";
if ($inputformat == "genome.gov") {
print "Removing <FF> chars from p_value_text...";
theDb()->query ("
UPDATE gwas
SET p_value_text = REPLACE(p_value_text,char(255),'')
");
print theDb()->affectedRows();
print "\n";
}
print "Looking up rsid,allele -> variant_id via existing evidence...";
theDb()->query ("ALTER TABLE gwas ADD variant_id BIGINT UNSIGNED");
theDb()->query ("ALTER TABLE gwas ADD INDEX(variant_id)");
$q = theDb()->query ("
UPDATE gwas g
LEFT JOIN variant_occurs o
ON o.rsid = substr(g.snps,3,99)
SET g.variant_id=o.variant_id
");
if (theDb()->isError($q)) print $q->getMessage();
print theDb()->affectedRows();
print "\n";
print "Looking up rsid,allele -> gene_aa via variant_locations...";
theDb()->query ("ALTER TABLE gwas ADD gene_aa VARCHAR(32)");
theDb()->query ("ALTER TABLE gwas ADD INDEX(gene_aa)");
$q = theDb()->query ("
UPDATE gwas g
LEFT JOIN variant_locations l
ON l.rsid = substr(g.snps,3,99)
AND l.allele = g.allele
SET g.gene_aa=l.gene_aa
WHERE g.variant_id IS NULL
");
if (theDb()->isError($q)) print $q->getMessage();
print theDb()->affectedRows();
print "\n";
print "Adding serial number...";
theDb()->query ("ALTER TABLE gwas ADD gwas_id SERIAL");
print theDb()->affectedRows();
print "\n";
print "Adding variants...";
$q = theDb()->query ("
SELECT gene_aa, snps, gwas_id
FROM gwas
WHERE variant_id IS NULL");
$n=0;
$did = array();
while ($row =& $q->fetchRow())
{
$variant_name = $row["gene_aa"] ? $row["gene_aa"] : $row["snps"];
if (isset ($did[$variant_name])) {
$variant_id = $did[$variant_name];
theDb()->query ("UPDATE gwas SET variant_id=? WHERE gwas_id=?",
array ($variant_id, $row["gwas_id"]));
continue;
}
// create the variant, and an "initial edit/add" row in edits table
$variant_id = evidence_get_variant_id ($variant_name,
false, false, false,
true);
$edit_id = evidence_get_latest_edit ($variant_id,
0, 0, 0,
true, array ("variant_impact" => "pathogenic"));
$did[$variant_name] = $variant_id;
theDb()->query ("UPDATE gwas SET variant_id=? WHERE gwas_id=?",
array ($variant_id, $row["gwas_id"]));
if ($row["gene_aa"])
theDb()->query ("UPDATE variant_locations SET variant_id=? WHERE gene_aa=?",
array ($variant_id, $row["gene_aa"]));
++$n;
if ($n % 100 == 0)
print "$n...";
}
print "$n\n";
print "Adding diseases...";
theDb()->query ("INSERT IGNORE INTO diseases (disease_name) SELECT disease_trait FROM gwas");
print theDb()->affectedRows();
print "\n";
print "Looking up disease IDs...";
theDb()->query ("ALTER TABLE gwas ADD disease_id BIGINT NOT NULL");
theDb()->query ("UPDATE gwas
LEFT JOIN diseases d
ON gwas.disease_trait = d.disease_name
SET gwas.disease_id = d.disease_id");
print theDb()->affectedRows();
print "\n";
theDb()->query ("UNLOCK TABLES");
print "Updating variant_disease...";
theDb()->query ("LOCK TABLES variant_disease WRITE");
theDb()->query ("DELETE FROM variant_disease WHERE dbtag='GWAS'");
theDb()->query ("INSERT IGNORE INTO variant_disease
(variant_id, disease_id, dbtag)
SELECT variant_id, disease_id, 'GWAS'
FROM gwas
WHERE variant_id > 0 AND disease_id > 0");
print theDb()->affectedRows();
print "\n";
theDb()->query ("UNLOCK TABLES");
print "Updating variant_external...";
theDb()->query ("LOCK TABLES variant_external WRITE");
theDb()->query ("DELETE FROM variant_external WHERE tag='GWAS'");
$q = theDb()->query ("INSERT INTO variant_external
(variant_id, tag, content, url, updated)
SELECT variant_id, 'GWAS', CONCAT(disease_trait,' (',risk_allele,')\n',first_author,' ',pub_date,' in ',journal,'\nOR or beta: ',or_or_beta,' ',ci_95_text,IF(risk_allele_frequency is null,'',CONCAT('\nRisk allele frequency: ',risk_allele_frequency)),IF(p_value is null,'',CONCAT('\np-value: ',p_value,' ',p_value_text)),'\nInitial sample: ',initial_sample_size,'\nReplication sample: ',replication_sample_size), url, NOW()
FROM gwas
WHERE variant_id > 0");
if (theDb()->isError($q)) print "[".$q->getMessage()."]";
print theDb()->affectedRows();
print "\n";
theDb()->query ("UNLOCK TABLES");
if ($inputformat == "genome.gov") {
print "Adding or_or_beta_is_or column...";
theDb()->query ("ALTER TABLE gwas ADD or_or_beta_is_or CHAR(1)");
theDb()->query ("UPDATE gwas
SET or_or_beta_is_or=IF(or_or_beta IS NOT NULL
AND or_or_beta <> 'NR'
AND (ci_95_text LIKE '%]'
OR ci_95_text LIKE '%] (%)')
AND ci_95_text NOT LIKE '%]%]','Y','N')");
print theDb()->affectedRows();
print "\n";
}
print "Adding/updating gwas_max_or column in variants table...";
theDb()->query ("ALTER TABLE variants ADD gwas_max_or DECIMAL(6,3)");
theDb()->query ("CREATE TEMPORARY TABLE gwas_or_tmp
AS SELECT variant_id, MAX(or_or_beta) or_or_beta
FROM gwas
WHERE variant_id IS NOT NULL AND or_or_beta_is_or='Y'
GROUP BY variant_id");
print theDb()->affectedRows();
print "...";
$q = theDb()->query ("UPDATE gwas_or_tmp
LEFT JOIN variants
ON variants.variant_id=gwas_or_tmp.variant_id
SET variants.gwas_max_or=or_or_beta
");
if (theDb()->isError($q)) print "[".$q->getMessage()."]";
print theDb()->affectedRows();
print "\n";
if (getenv("DEBUG"))
{
theDb()->query ("DROP TABLE IF EXISTS gwas_last");
theDb()->query ("CREATE TABLE IF NOT EXISTS gwas_last LIKE gwas");
theDb()->query ("INSERT INTO gwas_last SELECT * FROM gwas");
}
?>