forked from wym6912/sreformat
-
Notifications
You must be signed in to change notification settings - Fork 0
/
msf.c
391 lines (349 loc) · 12.3 KB
/
msf.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
/*****************************************************************
* HMMER - Biological sequence analysis with profile HMMs
* Copyright (C) 1992-2003 Washington University School of Medicine
* All Rights Reserved
*
* This source code is distributed under the terms of the
* GNU General Public License. See the files COPYING and LICENSE
* for details.
*****************************************************************/
/* msf.c
* SRE, Sun Jul 11 16:17:32 1993
*
* Import/export of GCG MSF multiple sequence alignment
* formatted files. Designed using format specifications
* kindly provided by Steve Smith of Genetics Computer Group.
*
* CVS $Id: msf.c,v 1.6 2003/04/14 16:00:16 eddy Exp $
*/
#include "squidconf.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <time.h>
#include "squid.h"
#include "msa.h"
#ifdef TESTDRIVE_MSF
/*****************************************************************
* msf.c test driver:
* cc -DTESTDRIVE_MSF -g -O2 -Wall -o test msf.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c sqio.c alignio.c selex.c interleaved.c types.c -lm
*
*/
int
main(int argc, char **argv)
{
MSAFILE *afp;
MSA *msa;
char *file;
file = argv[1];
if ((afp = MSAFileOpen(file, MSAFILE_STOCKHOLM, NULL)) == NULL)
Die("Couldn't open %s\n", file);
while ((msa = ReadMSF(afp)) != NULL)
{
WriteMSF(stdout, msa);
MSAFree(msa);
}
MSAFileClose(afp);
exit(0);
}
/******************************************************************/
#endif /* testdrive_msf */
/* Function: ReadMSF()
* Date: SRE, Tue Jun 1 08:07:22 1999 [St. Louis]
*
* Purpose: Parse an alignment read from an open MSF format
* alignment file. (MSF is a single-alignment format.)
* Return the alignment, or NULL if we've already
* read the alignment.
*
* Args: afp - open alignment file
*
* Returns: MSA * - an alignment object
* caller responsible for an MSAFree()
* NULL if no more alignments
*
* Diagnostics:
* Will Die() here with a (potentially) useful message
* if a parsing error occurs.
*/
MSA *
ReadMSF(MSAFILE *afp)
{
MSA *msa;
char *s;
int alleged_alen;
int alleged_type;
int alleged_checksum;
char *tok;
char *sp;
int slen;
int sqidx;
char *name;
char *seq;
if (feof(afp->f)) return NULL;
if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
/* The first line is the header.
* This is a new-ish GCG feature. Don't count on it, so
* we can be a bit more tolerant towards non-GCG software
* generating "MSF" files.
*/
msa = MSAAlloc(10, 0);
if (strncmp(s, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0) {
msa->type = kAmino;
if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
} else if (strncmp(s, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0) {
msa->type = kRNA;
if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
}
/* Now we're in the free text comment section of the MSF file.
* It ends when we see the "MSF: Type: Check: .." line.
* This line must be present.
*/
do
{
if ((strstr(s, "..") != NULL && strstr(s, "MSF:") != NULL) &&
Strparse("^.+MSF: +([0-9]+) +Type: +([PNX]).+Check: +([0-9]+) +\\.\\.", s, 3))
{
alleged_alen = atoi(sqd_parse[0]);
switch (*(sqd_parse[1])) {
case 'N' : alleged_type = kRNA; break;
case 'P' : alleged_type = kAmino; break;
case 'X' : alleged_type = kOtherSeq; break;
default : alleged_type = kOtherSeq;
}
alleged_checksum = atoi(sqd_parse[3]);
if (msa->type == kOtherSeq) msa->type = alleged_type;
break; /* we're done with comment section. */
}
if (! IsBlankline(s))
MSAAddComment(msa, s);
} while ((s = MSAFileGetLine(afp)) != NULL);
/* Now we're in the name section.
* GCG has a relatively poorly documented feature: only sequences that
* appear in this list will be read from the alignment section. Commenting
* out sequences in the name list (by preceding them with "!") is
* allowed as a means of manually defining subsets of sequences in
* the alignment section. We can support this feature reasonably
* easily because of the hash table for names in the MSA: we
* only add names to the hash table when we see 'em in the name section.
*/
while ((s = MSAFileGetLine(afp)) != NULL)
{
while ((*s == ' ' || *s == '\t') && *s) s++; /* skip leading whitespace */
if (*s == '\n') continue; /* skip blank lines */
else if (*s == '!') MSAAddComment(msa, s);
else if ((sp = strstr(s, "Name:")) != NULL)
{
/* We take the name and the weigh, and that's it */
sp += 5;
tok = sre_strtok(&sp, " \t", &slen); /* <sequence name> */
sqidx = GKIStoreKey(msa->index, tok);
if (sqidx >= msa->nseqalloc) MSAExpand(msa);
msa->sqname[sqidx] = sre_strdup(tok, slen);
msa->nseq++;
if ((sp = strstr(sp, "Weight:")) == NULL)
Die("No Weight: on line %d for %s in name section of MSF file %s\n",
afp->linenumber, msa->sqname[sqidx], afp->fname);
sp += 7;
tok = sre_strtok(&sp, " \t", &slen);
msa->wgt[sqidx] = atof(tok);
msa->flags |= MSA_SET_WGT;
}
else if (strncmp(s, "//", 2) == 0)
break;
else
{
Die("Invalid line (probably %d) in name section of MSF file %s:\n%s\n",
afp->linenumber, afp->fname, s);
squid_errno = SQERR_FORMAT; /* NOT THREADSAFE */
return NULL;
}
}
/* And now we're in the sequence section.
* As discussed above, if we haven't seen a sequence name, then we
* don't include the sequence in the alignment.
* Also, watch out for coordinate-only lines.
*/
while ((s = MSAFileGetLine(afp)) != NULL)
{
sp = s;
if ((name = sre_strtok(&sp, " \t", NULL)) == NULL) continue;
if ((seq = sre_strtok(&sp, "\n", &slen)) == NULL) continue;
/* The test for a coord line: digits starting both fields
*/
if (isdigit((int) *name) && isdigit((int) *seq))
continue;
/* It's not blank, and it's not a coord line: must be sequence
*/
sqidx = GKIKeyIndex(msa->index, name);
if (sqidx < 0) continue; /* not a sequence we recognize */
msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen);
}
/* We've left blanks in the aseqs; take them back out.
*/
for (sqidx = 0; sqidx < msa->nseq; sqidx++)
{
if (msa->aseq[sqidx] == NULL)
Die("Didn't find a sequence for %s in MSF file %s\n", msa->sqname[sqidx], afp->fname);
for (s = sp = msa->aseq[sqidx]; *s != '\0'; s++)
{
if (*s == ' ' || *s == '\t') {
msa->sqlen[sqidx]--;
} else {
*sp = *s;
sp++;
}
}
*sp = '\0';
}
MSAVerifyParse(msa); /* verifies, and also sets alen and wgt. */
return msa;
}
/* Function: WriteMSF()
* Date: SRE, Mon May 31 11:25:18 1999 [St. Louis]
*
* Purpose: Write an alignment in MSF format to an open file.
*
* Args: fp - file that's open for writing.
* msa - alignment to write.
*
* Note that msa->type, usually optional, must be
* set for WriteMSF to work. If it isn't, a fatal
* error is generated.
*
* Returns: (void)
*/
void
WriteMSF(FILE *fp, MSA *msa)
{
time_t now; /* current time as a time_t */
char date[64]; /* today's date in GCG's format "October 3, 1996 15:57" */
char **gcg_aseq; /* aligned sequences with gaps converted to GCG format */
char **gcg_sqname; /* sequence names with GCG-valid character sets */
int idx; /* counter for sequences */
char *s; /* pointer into sqname or seq */
int len; /* tmp variable for name lengths */
int namelen; /* maximum name length used */
int pos; /* position counter */
char buffer[51]; /* buffer for writing seq */
int i; /* another position counter */
/*****************************************************************
* Make copies of sequence names and sequences.
* GCG recommends that name characters should only contain
* alphanumeric characters, -, or _
* Some GCG and GCG-compatible software is sensitive to this.
* We silently convert all other characters to '_'.
*
* For sequences, GCG allows only ~ and . for gaps.
* Otherwise, everthing is interpreted as a residue;
* so squid's IUPAC-restricted chars are fine. ~ means
* an external gap. . means an internal gap.
*****************************************************************/
/* make copies that we can edit */
gcg_aseq = MallocOrDie(sizeof(char *) * msa->nseq);
gcg_sqname = MallocOrDie(sizeof(char *) * msa->nseq);
for (idx = 0; idx < msa->nseq; idx++)
{
gcg_aseq[idx] = sre_strdup(msa->aseq[idx], msa->alen);
gcg_sqname[idx] = sre_strdup(msa->sqname[idx], -1);
}
/* alter names as needed */
for (idx = 0; idx < msa->nseq; idx++)
for (s = gcg_sqname[idx]; *s != '\0'; s++)
if (! isalnum((int) *s) && *s != '-' && *s != '_')
*s = '_';
/* alter gap chars in seq */
for (idx = 0; idx < msa->nseq; idx++)
{
for (s = gcg_aseq[idx]; *s != '\0' && isgap(*s); s++)
*s = '~';
for (; *s != '\0'; s++)
if (isgap(*s)) *s = '.';
for (pos = msa->alen-1; pos > 0 && isgap(gcg_aseq[idx][pos]); pos--)
gcg_aseq[idx][pos] = '~';
}
/* calculate max namelen used */
namelen = 0;
for (idx = 0; idx < msa->nseq; idx++)
if ((len = strlen(msa->sqname[idx])) > namelen)
namelen = len;
/*****************************************************
* Write the MSF header
*****************************************************/
/* required file type line */
if (msa->type == kOtherSeq)
msa->type = GuessAlignmentSeqtype(msa->aseq, msa->nseq);
if (msa->type == kRNA) fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
else if (msa->type == kDNA) fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
else if (msa->type == kAmino) fprintf(fp, "!!AA_MULTIPLE_ALIGNMENT 1.0\n");
else if (msa->type == kOtherSeq)
Die("WriteMSF(): couldn't guess whether that alignment is RNA or protein.\n");
else
Die("Invalid sequence type %d in WriteMSF()\n", msa->type);
/* free text comments */
if (msa->ncomment > 0)
{
for (idx = 0; idx < msa->ncomment; idx++)
fprintf(fp, "%s\n", msa->comment[idx]);
fprintf(fp, "\n");
}
/* required checksum line */
now = time(NULL);
if (strftime(date, 64, "%B %d, %Y %H:%M", localtime(&now)) == 0)
Die("What time is it on earth? strftime() failed in WriteMSF().\n");
fprintf(fp, " %s MSF: %d Type: %c %s Check: %d ..\n",
msa->name != NULL ? msa->name : "squid.msf",
msa->alen,
msa->type == kRNA ? 'N' : 'P',
date,
GCGMultchecksum(gcg_aseq, msa->nseq));
fprintf(fp, "\n");
/*****************************************************
* Names/weights section
*****************************************************/
for (idx = 0; idx < msa->nseq; idx++)
{
fprintf(fp, " Name: %-*.*s Len: %5d Check: %4d Weight: %.2f\n",
namelen, namelen,
gcg_sqname[idx],
msa->alen,
GCGchecksum(gcg_aseq[idx], msa->alen),
msa->wgt[idx]);
}
fprintf(fp, "\n");
fprintf(fp, "//\n");
/*****************************************************
* Write the sequences
*****************************************************/
for (pos = 0; pos < msa->alen; pos += 50)
{
fprintf(fp, "\n"); /* Blank line between sequence blocks */
/* Coordinate line */
len = (pos + 50) > msa->alen ? msa->alen - pos : 50;
if (len > 10)
fprintf(fp, "%*s %-6d%*s%6d\n", namelen, "",
pos+1,
len + ((len-1)/10) - 12, "",
pos + len);
else
fprintf(fp, "%*s %-6d\n", namelen, "", pos+1);
for (idx = 0; idx < msa->nseq; idx++)
{
fprintf(fp, "%-*s ", namelen, gcg_sqname[idx]);
/* get next line's worth of 50 from seq */
strncpy(buffer, gcg_aseq[idx] + pos, 50);
buffer[50] = '\0';
/* draw the sequence line */
for (i = 0; i < len; i++)
{
if (! (i % 10)) fputc(' ', fp);
fputc(buffer[i], fp);
}
fputc('\n', fp);
}
}
Free2DArray((void **) gcg_aseq, msa->nseq);
Free2DArray((void **) gcg_sqname, msa->nseq);
return;
}