diff --git a/README.md b/README.md index 3202051..a324ca6 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,11 @@ Modified by wym6912 wym6912@outlook.com +## Updates (v1.4.1.2) + +- Fix bug in outputting RNA sequences in `v1.4.1.1` +- Add `BLOSUM62` matrix arugment (`-B/--blosum`) + ## Updates (v1.4.1.1) - Add Windows compile support in `Visual Studio 2022` diff --git a/abPOA.rc b/abPOA.rc index e95f8ac..48b2c8a 100644 Binary files a/abPOA.rc and b/abPOA.rc differ diff --git a/include/abpoa.h b/include/abpoa.h index 87bbdcf..e24935a 100644 --- a/include/abpoa.h +++ b/include/abpoa.h @@ -75,6 +75,7 @@ typedef struct { int align_mode, gap_mode, max_n_cons; double min_freq; // for multiploid data int verbose; // to control output msg + int has_u; // detect RNA sequences // char LogTable65536[65536]; // char bit_table16[65536]; diff --git a/src/abpoa.c b/src/abpoa.c index 93e46b5..8b18ae8 100644 --- a/src/abpoa.c +++ b/src/abpoa.c @@ -20,7 +20,7 @@ char PROG[20] = "abpoa"; #define _bO BOLD UNDERLINE "O" NONE #define _bA BOLD UNDERLINE "A" NONE char DESCRIPTION[200] = _ba "daptive " _bb "anded " _bP "artial " _bO "rder " _bA "lignment"; -char VERSION[20] = "1.4.1.1"; +char VERSION[20] = "1.4.1.2"; char CONTACT[30] = "gaoy1@chop.edu"; const struct option abpoa_long_opt [] = { @@ -55,6 +55,7 @@ const struct option abpoa_long_opt [] = { { "out-pog", 1, NULL, 'g' }, { "max-num-cons", 1, NULL, 'd', }, { "min-freq", 1, NULL, 'q', }, + { "blosum", 0, NULL, 'B', }, { "help", 0, NULL, 'h' }, { "version", 0, NULL, 'v' }, @@ -115,6 +116,7 @@ int abpoa_usage(void) err_printf(" Input/Output:\n"); err_printf(" -Q --use-qual-weight take base quality score from FASTQ input file as graph edge weight [False]\n"); err_printf(" -c --amino-acid input sequences are amino acid (default is nucleotide) [False]\n"); + err_printf(" -B --blosum use BLOSUM62 for protein alignment [False]\n"); err_printf(" -l --in-list input file is a list of sequence file names [False]\n"); err_printf(" each line is one sequence file containing a set of sequences\n"); err_printf(" which will be aligned by abPOA to generate a consensus sequence\n"); @@ -161,7 +163,7 @@ int abpoa_main(char *file_fn, int is_list, abpoa_para_t *abpt){ int main(int argc, char **argv) { int c, m, in_list=0; char *s; abpoa_para_t *abpt = abpoa_init_para(); - while ((c = getopt_long(argc, argv, "m:M:X:t:O:E:b:f:z:e:QSk:w:n:i:clpso:r:g:d:q:hvV:", abpoa_long_opt, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "m:M:X:t:O:E:b:f:z:e:QSk:w:n:i:clpsBo:r:g:d:q:hvV:", abpoa_long_opt, NULL)) >= 0) { switch(c) { case 'm': m = atoi(optarg); @@ -203,6 +205,7 @@ int main(int argc, char **argv) { else err_printf("Error: unknown output result mode: %s.\n", optarg); break; case 'g': abpt->out_pog= strdup(optarg); break; + case 'B': abpt->use_score_matrix = 2; break; case 'd': abpt->max_n_cons = atoi(optarg); break; case 'q': abpt->min_freq = atof(optarg); break; diff --git a/src/abpoa.h b/src/abpoa.h index 87bbdcf..e24935a 100644 --- a/src/abpoa.h +++ b/src/abpoa.h @@ -75,6 +75,7 @@ typedef struct { int align_mode, gap_mode, max_n_cons; double min_freq; // for multiploid data int verbose; // to control output msg + int has_u; // detect RNA sequences // char LogTable65536[65536]; // char bit_table16[65536]; diff --git a/src/abpoa_align.c b/src/abpoa_align.c index 6eee392..43b0c44 100644 --- a/src/abpoa_align.c +++ b/src/abpoa_align.c @@ -30,6 +30,7 @@ extern char ab_aa26_table[256]; extern char ab_aa256_table[256]; extern char ab_char26_table[256]; extern char ab_char256_table[256]; +extern char ab_blosum62[][100]; void parse_mat_first_line(char *l, int *order) { int i, n; @@ -84,6 +85,40 @@ void abpoa_set_mat_from_file(abpoa_para_t *abpt, char *mat_fn) { free(l); free(order); fclose(fp); } +void abpoa_set_mat_from_var(abpoa_para_t *abpt, char** matrix, int abpt_sz) +{ + char *this_line = matrix[0]; + int first_line = 1; + if(abpt_sz != abpt->m) + { + err_func_printf(__func__, "Warning: matrix length %d is different with default setting (%d). Use default matrix", abpt->m, abpt_sz); + return; + } + int *order = (int*)_err_malloc(abpt_sz * sizeof(int)); + for(; strlen(this_line) > 0; ++ this_line) + { + if(first_line) + { + first_line = 0; + // get string bases + parse_mat_first_line(this_line, order); + } + else + { + // get match/mismatch scores + parse_mat_score_line(this_line, order, abpt->m, abpt->mat); + } + } + int i; abpt->min_mis = 0, abpt->max_mat = 0; + for (i = 0; i < abpt->m * abpt->m; ++i) { + if (abpt->mat[i] > abpt->max_mat) + abpt->max_mat = abpt->mat[i]; + if (-abpt->mat[i] > abpt->min_mis) + abpt->min_mis = -abpt->mat[i]; + } + free(order); +} + void abpoa_set_gap_mode(abpoa_para_t *abpt) { if (abpt->gap_open1 == 0) abpt->gap_mode = ABPOA_LINEAR_GAP; else if (abpt->gap_open1 > 0 && abpt->gap_open2 == 0) abpt->gap_mode = ABPOA_AFFINE_GAP; @@ -115,6 +150,7 @@ abpoa_para_t *abpoa_init_para(void) { // number of residue types abpt->m = 5; // nucleotide abpt->mat = (int*)_err_malloc(abpt->m * abpt->m * sizeof(int)); + abpt->has_u = 0; // DNA sequence // score matrix abpt->use_score_matrix = 0; @@ -164,7 +200,14 @@ void abpoa_post_set_para(abpoa_para_t *abpt) { } } if (abpt->use_score_matrix == 0) gen_simple_mat(abpt); - else abpoa_set_mat_from_file(abpt, abpt->mat_fn); + else if(abpt->use_score_matrix == 1) abpoa_set_mat_from_file(abpt, abpt->mat_fn); + else if(abpt->use_score_matrix == 2) + { + _err_simple_func_printf("Try to use BLOSUM62 matrix"); + abpoa_set_mat_from_var(abpt, (char**)ab_blosum62, 26); + + } + else err_fatal_core(__func__, "score matrix = %d, which is unsupported.\n", abpt->use_score_matrix); } void abpoa_free_para(abpoa_para_t *abpt) { @@ -443,7 +486,7 @@ int abpoa_msa1(abpoa_t *ab, abpoa_para_t *abpt, char *read_fn, FILE *out_fp) { abpoa_seq_t *abs = ab->abs; int exist_n_seq = abs->n_seq; // read seq from read_fn - FILE* readfp = fopen(read_fn, "r"); kseq_t *ks = kseq_init(fileno(readfp)); + FILE* readfp = xopen(read_fn, "r"); kseq_t *ks = kseq_init(fileno(readfp)); int i, j, n_seq = abpoa_read_seq(abs, ks); // always reset graph before perform POA @@ -452,21 +495,46 @@ int abpoa_msa1(abpoa_t *ab, abpoa_para_t *abpt, char *read_fn, FILE *out_fp) { if (abs->seq[i].l > max_len) max_len = abs->seq[i].l; } + // detect u and t + int has_st = 0, has_su = 0; + // set seqs, seq_lens extern char ab_char26_table[256]; uint8_t **seqs = (uint8_t**)_err_malloc(n_seq * sizeof(uint8_t*)); int *seq_lens = (int*)_err_malloc(n_seq * sizeof(int)); int **weights = (int**)_err_malloc(n_seq * sizeof(int*)); + char ch; for (i = 0; i < n_seq; ++i) { seq_lens[i] = abs->seq[exist_n_seq+i].l; seqs[i] = (uint8_t*)_err_malloc(sizeof(uint8_t) * seq_lens[i]); weights[i] = (int*)_err_malloc(sizeof(int) * seq_lens[i]); - for (j = 0; j < seq_lens[i]; ++j) seqs[i][j] = ab_char26_table[(int)abs->seq[exist_n_seq+i].s[j]]; + for (j = 0; j < seq_lens[i]; ++j) + { + ch = abs->seq[exist_n_seq+i].s[j]; + seqs[i][j] = ab_char26_table[(int)ch]; + if(ch == 'U' || ch == 'u') ++ has_su; + else if(ch == 'T' || ch == 't') ++ has_st; + } if (abpt->use_qv && abs->qual[exist_n_seq+i].l > 0) { for (j = 0; j < seq_lens[i]; ++j) weights[i][j] = (int)abs->qual[exist_n_seq+i].s[j]-32; } else { for (j = 0; j < seq_lens[i]; ++j) weights[i][j] = 1; } } + if(abpt->m == 5) // nuc + { + if(has_st && has_su) err_fatal_core(__func__, "DNA sequence has U and T. Please check the sequences, use --amino-acid or -c if sequences are protein.\n"); + else if(has_su) + { + _err_simple_func_printf("detected RNA sequences"); + abpt->has_u = 1; + } + else + { + _err_simple_func_printf("detected DNA sequences"); + abpt->has_u = 0; + } + } + else _err_simple_func_printf("detected Protein sequences\n"); if ((abpt->disable_seeding && abpt->progressive_poa==0) || abpt->align_mode != ABPOA_GLOBAL_MODE) { abpoa_poa(ab, abpt, seqs, weights, seq_lens, exist_n_seq, n_seq); } else { diff --git a/src/abpoa_output.c b/src/abpoa_output.c index 0fc4fb9..7ede4a0 100644 --- a/src/abpoa_output.c +++ b/src/abpoa_output.c @@ -70,8 +70,7 @@ abpoa_cons_t *abpoa_allocate_rc_msa(abpoa_cons_t *abc, int msa_len, int n_seq, i void abpoa_output_rc_msa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) { if (out_fp == NULL) return; // print U in RNA - short changedU = (ab_char256_table['U'] != 'U'); char rawU = ab_char256_table['U']; - if(changedU) ab_char256_table['U'] = 'U'; + if(abpt->has_u) ab_char256_table[3] = 'U'; int i, j; abpoa_seq_t *abs = ab->abs; abpoa_cons_t *abc = ab->abc; if (abc->msa_len <= 0) return; @@ -101,7 +100,7 @@ void abpoa_output_rc_msa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) { fprintf(out_fp, "\n"); } } - ab_char256_table['U'] = rawU; // revert change + ab_char256_table[3] = 'T'; // revert change } void abpoa_set_msa_seq(abpoa_node_t node, int rank, uint8_t **msa_base) { @@ -190,6 +189,8 @@ void abpoa_generate_gfa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) { kdq_int_t *q = kdq_init_int(); + if(abpt->has_u) ab_char256_table[3] = 'U'; + // Breadth-First-Search kdq_push_int(q, ABPOA_SRC_NODE_ID); while ((id = kdq_shift_int(q)) != 0) { @@ -269,6 +270,7 @@ void abpoa_generate_gfa(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) { free(in_degree); for (i = 0; i < n_seq; ++i) free(read_paths[i]); free(read_paths); free(read_path_i); + ab_char256_table[3] = 'T'; } int abpoa_cons_phred_score(int n_cov, int n_seq) { @@ -498,6 +500,7 @@ void abpoa_multip_heaviest_bundling(abpoa_graph_t *abg, abpoa_para_t *abpt, int void abpoa_output_fx_consensus(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) { if (out_fp == NULL) return; + if(abpt->has_u) ab_char256_table[3] = 'U'; int cons_i, j; abpoa_cons_t *abc = ab->abc; for (cons_i = 0; cons_i < abc->n_cons; ++cons_i) { @@ -529,6 +532,7 @@ void abpoa_output_fx_consensus(abpoa_t *ab, abpoa_para_t *abpt, FILE *out_fp) { } fprintf(out_fp, "\n"); } } + ab_char256_table[3] = 'T'; } abpoa_cons_t *abpoa_allocate_cons(abpoa_cons_t *abc, int n_node, int n_seq, int n_cons) { diff --git a/src/abpoa_seq.c b/src/abpoa_seq.c index 3f1bdc1..997e82b 100644 --- a/src/abpoa_seq.c +++ b/src/abpoa_seq.c @@ -97,6 +97,38 @@ const char ab_aa256_table[256] = { char ab_char26_table[256]; char ab_char256_table[256]; +char ab_blosum62[][100] = { + {"A R N D C Q E G H I L K M F P S T W Y V B J Z X * O U"}, + {"A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 -1 -1 -4 -4 -4"}, + {"R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 -2 0 -1 -4 -4 -4"}, + {"N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 4 -3 0 -1 -4 -4 -4"}, + {"D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 -3 1 -1 -4 -4 -4"}, + {"C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -1 -3 -1 -4 -4 -4"}, + {"Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 -2 4 -1 -4 -4 -4"}, + {"E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 -3 4 -1 -4 -4 -4"}, + {"G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -4 -2 -1 -4 -4 -4"}, + {"H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 -3 0 -1 -4 -4 -4"}, + {"I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 3 -3 -1 -4 -4 -4"}, + {"L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 3 -3 -1 -4 -4 -4"}, + {"K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 -3 1 -1 -4 -4 -4"}, + {"M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 2 -1 -1 -4 -4 -4"}, + {"F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 0 -3 -1 -4 -4 -4"}, + {"P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -3 -1 -1 -4 -4 -4"}, + {"S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 -2 0 -1 -4 -4 -4"}, + {"T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 -1 -1 -4 -4 -4"}, + {"W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -2 -2 -1 -4 -4 -4"}, + {"Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -1 -2 -1 -4 -4 -4"}, + {"V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 2 -2 -1 -4 -4 -4"}, + {"B -2 -1 4 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 -3 0 -1 -4 -4 -4"}, + {"J -1 -2 -3 -3 -1 -2 -3 -4 -3 3 3 -3 2 0 -3 -2 -1 -2 -1 2 -3 3 -3 -1 -4 -4 -4"}, + {"Z -1 0 0 1 -3 4 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -2 -2 -2 0 -3 4 -1 -4 -4 -4"}, + {"X -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -4 -4 -4"}, + {"* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 -4 -4"}, + {"O -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 -4"}, + {"U -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1"}, + {""} +}; + abpoa_seq_t *abpoa_init_seq(void) { abpoa_seq_t *abs = (abpoa_seq_t*)_err_malloc(sizeof(abpoa_seq_t)); abs->n_seq = 0; abs->m_seq = CHUNK_READ_N;